Changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5278 f3b2605a-c512-4ea7-a41b-209d697bcdaa
pscrozi
2010-11-23 00:41:14 +00:00
parent 5a82c99485
commit 4e52d8d927
18 changed files with 3104 additions and 536 deletions


@@ -1,6 +1,7 @@
# Install/unInstall package files in LAMMPS
# edit Makefile.package to include/exclude GPU library
# do not copy gayberne files if non-GPU version does not exist
# do not copy charmm files if non-GPU version does not exist
if (test $1 = 1) then
@@ -12,14 +13,36 @@ if (test $1 = 1) then
sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package
sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package
fi
if (test -e ../pair_gayberne.cpp) then
cp pair_gayberne_gpu.cpp ..
cp pair_gayberne_gpu.h ..
fi
if (test -e ../pair_lj_cut_coul_long.cpp) then
cp pair_lj_cut_coul_long_gpu.cpp ..
cp pair_lj_cut_coul_long_gpu.h ..
fi
if (test -e ../pair_cg_cmm.cpp) then
cp pair_cg_cmm_gpu.cpp ..
cp pair_cg_cmm_gpu.h ..
fi
if (test -e ../pair_cg_cmm_coul_long.cpp) then
cp pair_cg_cmm_coul_long_gpu.cpp ..
cp pair_cg_cmm_coul_long_gpu.h ..
fi
cp pair_lj_cut_gpu.cpp ..
cp pair_lj96_cut_gpu.cpp ..
cp pair_lj_cut_coul_cut_gpu.cpp ..
cp pair_lj_cut_gpu.h ..
cp pair_lj96_cut_gpu.h ..
cp pair_lj_cut_coul_cut_gpu.h ..
cp fix_gpu.cpp ..
cp fix_gpu.h ..
elif (test $1 = 0) then
@@ -27,11 +50,23 @@ elif (test $1 = 0) then
sed -i -e 's/[^ \t]*gpu //' ../Makefile.package
sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package
fi
rm ../pair_gayberne_gpu.cpp
rm ../pair_lj_cut_gpu.cpp
rm ../pair_lj96_cut_gpu.cpp
rm ../pair_lj_cut_coul_cut_gpu.cpp
rm ../pair_lj_cut_coul_long_gpu.cpp
rm ../pair_cg_cmm_gpu.cpp
rm ../pair_cg_cmm_coul_long_gpu.cpp
rm ../fix_gpu.cpp
rm ../pair_gayberne_gpu.h
rm ../pair_lj_cut_gpu.h
rm ../pair_lj96_cut_gpu.h
rm ../pair_lj_cut_coul_cut_gpu.h
rm ../pair_lj_cut_coul_long_gpu.h
rm ../pair_cg_cmm_gpu.h
rm ../pair_cg_cmm_coul_long_gpu.h
rm ../fix_gpu.h
fi


@@ -3,10 +3,40 @@
# do not copy gayberne files if non-GPU version does not exist
for file in *.cpp *.h; do
if (test $file = pair_gayberne_gpu.cpp -a ! -e ../pair_gayberne.cpp) then
continue
fi
if (test $file = pair_gayberne_gpu.h -a ! -e ../pair_gayberne.cpp) then
continue
fi
if (test $file = pair_lj_cut_coul_long_gpu.cpp -a ! -e ../pair_lj_cut_coul_long.cpp) then
continue
fi
if (test $file = pair_lj_cut_coul_long_gpu.h -a ! -e ../pair_lj_cut_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_long_gpu.cpp -a ! -e ../pair_cg_cmm_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_long_gpu.h -a ! -e ../pair_cg_cmm_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi

src/GPU/fix_gpu.cpp (new file, 148 lines)

@@ -0,0 +1,148 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "string.h"
#include "stdlib.h"
#include "fix_gpu.h"
#include "atom.h"
#include "force.h"
#include "pair.h"
#include "respa.h"
#include "input.h"
#include "error.h"
#include "timer.h"
#include "modify.h"
#include "domain.h"
using namespace LAMMPS_NS;
enum{GPU_FORCE, GPU_NEIGH};
extern bool lmp_init_device(const int first_gpu, const int last_gpu,
const int gpu_mode, const double particle_split);
extern void lmp_clear_device();
extern double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul);
/* ---------------------------------------------------------------------- */
FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
Fix(lmp, narg, arg)
{
if (narg < 7) error->all("Illegal fix gpu command");
if (strcmp(arg[1],"all") != 0)
error->all("Illegal fix gpu command");
int gpu_mode, first_gpu, last_gpu;
double particle_split;
if (strcmp(arg[3],"force") == 0)
gpu_mode = GPU_FORCE;
else if (strcmp(arg[3],"force/neigh") == 0) {
gpu_mode = GPU_NEIGH;
if (domain->triclinic)
error->all("Cannot use force/neigh with triclinic box.");
} else
error->all("Illegal fix gpu command.");
first_gpu = atoi(arg[4]);
last_gpu = atoi(arg[5]);
particle_split = force->numeric(arg[6]);
if (particle_split==0 || particle_split>1)
error->all("Illegal fix gpu command.");
if (!lmp_init_device(first_gpu,last_gpu,gpu_mode,particle_split))
error->one("Could not find or initialize a specified accelerator device.");
}
/* ---------------------------------------------------------------------- */
FixGPU::~FixGPU()
{
lmp_clear_device();
}
/* ---------------------------------------------------------------------- */
int FixGPU::setmask()
{
int mask = 0;
mask |= POST_FORCE;
mask |= MIN_POST_FORCE;
return mask;
}
/* ---------------------------------------------------------------------- */
void FixGPU::init()
{
// Can only have 1 gpu fix that must be the first fix for a run
if ((void*)modify->fix[0] != (void*)this)
error->all("GPU is not the first fix for this run.");
}
/* ---------------------------------------------------------------------- */
void FixGPU::setup(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
void FixGPU::min_setup(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
void FixGPU::post_force(int vflag)
{
timer->stamp();
double lvirial[6];
for (int i = 0; i < 6; i++) lvirial[i] = 0.0;
double my_eng = lmp_gpu_forces(atom->f, atom->torque, force->pair->eatom,
force->pair->vatom, lvirial,
force->pair->eng_coul);
force->pair->eng_vdwl += my_eng;
force->pair->virial[0] += lvirial[0];
force->pair->virial[1] += lvirial[1];
force->pair->virial[2] += lvirial[2];
force->pair->virial[3] += lvirial[3];
force->pair->virial[4] += lvirial[4];
force->pair->virial[5] += lvirial[5];
if (force->pair->vflag_fdotr) force->pair->virial_compute();
timer->stamp(TIME_PAIR);
}
/* ---------------------------------------------------------------------- */
void FixGPU::min_post_force(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
double FixGPU::memory_usage()
{
double bytes = 0.0;
// Memory usage currently returned by pair routine
return bytes;
}
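Judging from the constructor above, the fix expects an input line of the form `fix <ID> all gpu force|force/neigh <first-gpu> <last-gpu> <split>`, where `<split>` is rejected if it is 0 or greater than 1 and presumably sets the fraction of particles handled on the accelerator. The following is a minimal standalone sketch, with a hypothetical stub in place of the real lmp_gpu_forces(), of the accumulation contract that post_force() relies on; only the data flow is taken from the code above.

// Standalone sketch (not part of the commit): post_force() assumes the device
// library accumulates into the host force arrays, fills a 6-component virial
// buffer, returns the vdW energy by value and the Coulomb energy by reference.
// The stub below is hypothetical and uses a simplified argument list.
#include <cstdio>

static double lmp_gpu_forces_stub(double **f, double *virial, double &ecoul)
{
  f[0][0] += 1.0;                           // forces are added, not overwritten
  for (int i = 0; i < 6; i++) virial[i] += 0.1;
  ecoul = 0.25;                             // Coulomb energy back by reference
  return 0.5;                               // vdW energy back by value
}

int main()
{
  double row[3] = {0.0, 0.0, 0.0};
  double *f[1] = {row};
  double lvirial[6] = {0.0}, virial[6] = {0.0};
  double eng_vdwl = 0.0, eng_coul = 0.0;

  // same accumulation pattern as FixGPU::post_force()
  eng_vdwl += lmp_gpu_forces_stub(f, lvirial, eng_coul);
  for (int i = 0; i < 6; i++) virial[i] += lvirial[i];

  std::printf("evdwl %g ecoul %g fx %g v0 %g\n",
              eng_vdwl, eng_coul, f[0][0], virial[0]);
  return 0;
}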

src/GPU/fix_gpu.h (new file, 45 lines)

@@ -0,0 +1,45 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef FIX_CLASS
FixStyle(gpu,FixGPU)
#else
#ifndef LMP_FIX_GPU_H
#define LMP_FIX_GPU_H
#include "fix.h"
namespace LAMMPS_NS {
class FixGPU : public Fix {
public:
FixGPU(class LAMMPS *, int, char **);
~FixGPU();
int setmask();
void init();
void setup(int);
void min_setup(int);
void post_force(int);
void min_post_force(int);
double memory_usage();
private:
};
}
#endif
#endif
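The #ifdef FIX_CLASS / FixStyle(gpu,FixGPU) guard above is the usual LAMMPS two-pass header pattern: the header is read once with FIX_CLASS defined, so that only the style-to-class registration macro is expanded into a lookup table, and once more as an ordinary include, so that the class definition itself is seen. A generic, hypothetical single-file sketch of the same idea (the names RegisterStyle, Base, and DemoImpl are invented for illustration):

#include <cstdio>
#include <functional>
#include <map>
#include <string>

struct Base { virtual ~Base() {} virtual const char *name() const = 0; };
struct DemoImpl : Base { const char *name() const override { return "demo"; } };

int main()
{
  // "Pass 1": with the guard macro defined, including the style header only
  // contributes one table entry mapping the style keyword to a factory.
  std::map<std::string, std::function<Base *()>> styles;
#define RegisterStyle(key, cls) styles[#key] = []() -> Base * { return new cls(); };
  RegisterStyle(demo, DemoImpl)   // what FixStyle(gpu,FixGPU) does, conceptually
#undef RegisterStyle

  // "Pass 2" is the normal include that provides the class definition itself
  // (inlined above as DemoImpl for this sketch).
  Base *b = styles["demo"]();
  std::printf("created style: %s\n", b->name());
  delete b;
  return 0;
}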

src/GPU/pair_cg_cmm_coul_long_gpu.cpp (new file, 499 lines)

@@ -0,0 +1,499 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_cg_cmm_coul_long_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#include "kspace.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
// External functions from cuda library for atom decomposition
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald);
void cmml_gpu_clear();
int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void cmml_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double cmml_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairCGCMMCoulLongGPU::PairCGCMMCoulLongGPU(LAMMPS *lmp) : PairCGCMMCoulLong(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairCGCMMCoulLongGPU::~PairCGCMMCoulLongGPU()
{
cmml_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = cmml_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
cmml_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::init_style()
{
cut_respa = NULL;
if (!atom->q_flag)
error->all("Pair style cg/cmm/coul/long requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
// insure use of KSpace long-range solver, set g_ewald
if (force->kspace == NULL)
error->all("Pair style is incompatible with KSpace style");
g_ewald = force->kspace->g_ewald;
// setup force tables
if (ncoultablebits) init_tables();
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3,
lj4, offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq,
cut_coulsq_global, force->special_coul,
force->qqrd2e, g_ewald);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU cg/cmm pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairCGCMMCoulLongGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + cmml_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
int *ilist,*jlist,*numneigh,**firstneigh;
double rsq;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
const double delx = xtmp - x[j][0];
const double dely = ytmp - x[j][1];
const double delz = ztmp - x[j][2];
const double rsq = delx*delx + dely*dely + delz*delz;
const int jtype = type[j];
double evdwl = 0.0;
double ecoul = 0.0;
double fpair = 0.0;
if (rsq < cutsq[itype][jtype]) {
const double r2inv = 1.0/rsq;
const int cgt=cg_type[itype][jtype];
double forcelj = 0.0;
double forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
forcelj=factor_lj;
if (eflag) evdwl=factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv=r2inv*r2inv;
forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
forcelj *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
forcelj *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
}
if (rsq < cut_coulsq_global) {
if (!ncoultablebits || rsq <= tabinnersq) {
const double r = sqrt(rsq);
const double grij = g_ewald * r;
const double expm2 = exp(-grij*grij);
const double t = 1.0 / (1.0 + EWALD_P*grij);
const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const double prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (eflag) ecoul = prefactor*erfc;
if (factor_coul < 1.0) {
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
int itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
const double fraction = (rsq_lookup.f - rtable[itable]) *
drtable[itable];
const double table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (eflag) {
const double table2 = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table2;
}
if (factor_coul < 1.0) {
const double table2 = ctable[itable] + fraction*dctable[itable];
const double prefactor = qtmp*q[j] * table2;
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
}
}
fpair = (forcecoul + forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
double rsq;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
double evdwl = 0.0;
double ecoul = 0.0;
double fpair = 0.0;
if (rsq < cutsq[itype][jtype]) {
const double r2inv = 1.0/rsq;
const int cgt=cg_type[itype][jtype];
double forcelj = 0.0;
double forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
forcelj=factor_lj;
if (eflag) evdwl=factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv=r2inv*r2inv;
forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
forcelj *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
forcelj *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
}
if (rsq < cut_coulsq_global) {
if (!ncoultablebits || rsq <= tabinnersq) {
const double r = sqrt(rsq);
const double grij = g_ewald * r;
const double expm2 = exp(-grij*grij);
const double t = 1.0 / (1.0 + EWALD_P*grij);
const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const double prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (eflag) ecoul = prefactor*erfc;
if (factor_coul < 1.0) {
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
int itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
const double fraction = (rsq_lookup.f - rtable[itable]) *
drtable[itable];
const double table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (eflag) {
const double table2 = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table2;
}
if (factor_coul < 1.0) {
const double table2 = ctable[itable] + fraction*dctable[itable];
const double prefactor = qtmp*q[j] * table2;
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
}
}
fpair = (forcecoul + forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}
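For reference, the real-space Coulomb terms evaluated in the two cpu_compute() fallbacks above are the standard short-range Ewald expressions; EWALD_P and A1-A5 are the coefficients of the Abramowitz-Stegun rational approximation of erfc, and EWALD_F is 2/sqrt(pi). With g the g_ewald splitting parameter and the qqrd2e conversion factor omitted, the per-pair energy and force magnitude are

E_{ij} = \frac{q_i q_j}{r}\,\mathrm{erfc}(g r)

F_{ij} = \frac{q_i q_j}{r^{2}}\left[\mathrm{erfc}(g r) + \frac{2 g r}{\sqrt{\pi}}\,e^{-g^{2} r^{2}}\right]

\mathrm{erfc}(x) \approx t\left(A_1 + t(A_2 + t(A_3 + t(A_4 + t A_5)))\right)e^{-x^{2}},\qquad t = \frac{1}{1 + 0.3275911\,x}

In the code, forcecoul is r*F_{ij} and fpair = forcecoul*r2inv is F_{ij}/r, the per-distance factor applied to each component of (delx,dely,delz); ecoul is E_{ij} scaled by qqrd2e.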

src/GPU/pair_cg_cmm_coul_long_gpu.h (new file, 48 lines)

@@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(cg/cmm/coul/long/gpu,PairCGCMMCoulLongGPU)
#else
#ifndef LMP_PAIR_CG_CMM_COUL_LONG_GPU_H
#define LMP_PAIR_CG_CMM_COUL_LONG_GPU_H
#include "pair_cg_cmm_coul_long.h"
namespace LAMMPS_NS {
class PairCGCMMCoulLongGPU : public PairCGCMMCoulLong {
public:
PairCGCMMCoulLongGPU(LAMMPS *lmp);
~PairCGCMMCoulLongGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

src/GPU/pair_cg_cmm_gpu.cpp (new file, 362 lines)

@@ -0,0 +1,362 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_cg_cmm_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen);
void cmm_gpu_clear();
int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void cmm_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double cmm_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairCGCMMGPU::PairCGCMMGPU(LAMMPS *lmp) : PairCGCMM(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairCGCMMGPU::~PairCGCMMGPU()
{
cmm_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = cmm_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
cmm_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairCGCMMGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU CGCMM pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairCGCMMGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + cmm_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
const int cgt=cg_type[itype][jtype];
r2inv = 1.0/rsq;
fpair = factor_lj;
if (eflag) evdwl = factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv = r2inv*r2inv;
fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
fpair *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
fpair *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
fpair *= r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
const int cgt=cg_type[itype][jtype];
r2inv = 1.0/rsq;
fpair = factor_lj;
if (eflag) evdwl = factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv = r2inv*r2inv;
fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
fpair *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
fpair *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
fpair *= r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}
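The cpu_compute(int *nbors, ...) overload above walks the packed neighbor array handed back by the *_gpu_compute_n() builders in a strided, interleaved layout: the first nlocal-start integers are the per-atom neighbor counts, and each following block of nlocal-start integers holds the k-th neighbor of every atom, presumably so that adjacent GPU threads read adjacent addresses. A small standalone sketch with made-up sizes and neighbor indices:

#include <cstdio>
#include <vector>

// Hypothetical toy case: 3 host atoms (start=0, nlocal=3), up to 2 neighbors each.
// Layout assumed from the loops above:
//   nbors[i]                = neighbor count of atom i      (first "stride" ints)
//   nbors[i + (k+1)*stride] = k-th neighbor index of atom i  (one block per slot)
int main()
{
  const int start = 0, nlocal = 3;
  const int stride = nlocal - start;
  std::vector<int> nbors = { 2, 1, 2,      // counts: atom0=2, atom1=1, atom2=2
                             5, 4, 1,      // slot 0: first neighbor of each atom
                             7, 0, 6 };    // slot 1: second neighbor (unused for atom1)
  for (int i = start; i < nlocal; i++) {
    const int *nbor = nbors.data() + i - start;
    int jnum = *nbor;                      // same walk as in cpu_compute()
    nbor += stride;
    const int *nbor_end = nbor + stride * jnum;
    std::printf("atom %d:", i);
    for (; nbor < nbor_end; nbor += stride) std::printf(" %d", *nbor);
    std::printf("\n");
  }
  return 0;
}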

src/GPU/pair_cg_cmm_gpu.h (new file, 47 lines)

@@ -0,0 +1,47 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(cg/cmm/gpu,PairCGCMMGPU)
#else
#ifndef LMP_PAIR_CG_CMM_GPU_H
#define LMP_PAIR_CG_CMM_GPU_H
#include "pair_cg_cmm.h"
namespace LAMMPS_NS {
class PairCGCMMGPU : public PairCGCMM {
public:
PairCGCMMGPU(LAMMPS *lmp);
~PairCGCMMGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

src/GPU/pair_gayberne_gpu.cpp (modified)

@@ -31,123 +31,47 @@
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon,
const double mu, double **shape, double **well, double **cutsq,
double **sigma, double **epsilon, double *host_lshape,
int **form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
int &gpu_mode, FILE *screen);
void gb_gpu_clear();
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double **host_quat);
int * gb_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double **host_quat);
double gb_gpu_bytes();
using namespace LAMMPS_NS;
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
/* ---------------------------------------------------------------------- */
PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp),
gpu_mode(GPU_PAIR)
{
}
/* ----------------------------------------------------------------------
@@ -156,26 +80,8 @@ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
PairGayBerneGPU::~PairGayBerneGPU()
{
gb_gpu_clear();
cpu_time = 0.0;
}
/* ---------------------------------------------------------------------- */
@@ -183,141 +89,38 @@ PairGayBerneGPU::~PairGayBerneGPU()
void PairGayBerneGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = gb_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo, domain->subhi,
eflag, vflag, eflag_atom, vflag_atom, host_start,
cpu_time, success, atom->quat);
} else {
inum = list->inum;
olist = gb_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh,
list->firstneigh, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->quat);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start < inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist,host_start,eflag,vflag);
else
cpu_compute(host_start,eflag,vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
@@ -326,8 +129,6 @@ void PairGayBerneGPU::settings(int narg, char **arg)
void PairGayBerneGPU::init_style()
{
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type)
@@ -335,24 +136,7 @@ void PairGayBerneGPU::init_style()
if (atom->radius_flag)
error->all("Pair gayberne cannot be used with atom attribute diameter");
// per-type shape precalculations
for (int i = 1; i <= atom->ntypes; i++) {
if (setwell[i]) {
double *one = atom->shape[i];
@@ -364,72 +148,38 @@ void PairGayBerneGPU::init_style()
}
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu,
shape, well, cutsq, sigma, epsilon, lshape, form,
lj1, lj2, lj3, lj4, offset, force->special_lj,
atom->nlocal, atom->nlocal+atom->nghost, 300,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU Gay-Berne pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
@@ -437,6 +187,275 @@ void PairGayBerneGPU::init_style()
double PairGayBerneGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + gb_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
double fforce[3],ttor[3],rtor[3],r12[3];
double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
double **quat = atom->quat;
double **tor = atom->torque;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = olist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
itype = type[i];
if (form[itype][itype] == ELLIPSE_ELLIPSE) {
MathExtra::quat_to_mat_trans(quat[i],a1);
MathExtra::diag_times3(well[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,b1);
MathExtra::diag_times3(shape[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,g1);
}
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
// r12 = center to center vector
r12[0] = x[j][0]-x[i][0];
r12[1] = x[j][1]-x[i][1];
r12[2] = x[j][2]-x[i][2];
rsq = MathExtra::dot3(r12,r12);
jtype = type[j];
// compute if less than cutoff
if (rsq < cutsq[itype][jtype]) {
switch (form[itype][jtype]) {
case SPHERE_SPHERE:
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
forcelj *= -r2inv;
if (eflag) one_eng =
r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
offset[itype][jtype];
fforce[0] = r12[0]*forcelj;
fforce[1] = r12[1]*forcelj;
fforce[2] = r12[2]*forcelj;
ttor[0] = ttor[1] = ttor[2] = 0.0;
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
case SPHERE_ELLIPSE:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
ttor[0] = ttor[1] = ttor[2] = 0.0;
break;
case ELLIPSE_SPHERE:
one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
default:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
fforce,ttor,rtor);
break;
}
fforce[0] *= factor_lj;
fforce[1] *= factor_lj;
fforce[2] *= factor_lj;
ttor[0] *= factor_lj;
ttor[1] *= factor_lj;
ttor[2] *= factor_lj;
f[i][0] += fforce[0];
f[i][1] += fforce[1];
f[i][2] += fforce[2];
tor[i][0] += ttor[0];
tor[i][1] += ttor[1];
tor[i][2] += ttor[2];
if (eflag) evdwl = factor_lj*one_eng;
if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],fforce[2],
-r12[0],-r12[1],-r12[2]);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairGayBerneGPU::cpu_compute(int *nbors, int start, int eflag, int vflag)
{
int i,j,itype,jtype;
double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
double fforce[3],ttor[3],rtor[3],r12[3];
double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
double **x = atom->x;
double **f = atom->f;
double **quat = atom->quat;
double **tor = atom->torque;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_lj = force->special_lj;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
itype = type[i];
if (form[itype][itype] == ELLIPSE_ELLIPSE) {
MathExtra::quat_to_mat_trans(quat[i],a1);
MathExtra::diag_times3(well[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,b1);
MathExtra::diag_times3(shape[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,g1);
}
int *nbor = nbors+i-start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for ( ; nbor < nbor_end; nbor += stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
// r12 = center to center vector
r12[0] = x[j][0]-x[i][0];
r12[1] = x[j][1]-x[i][1];
r12[2] = x[j][2]-x[i][2];
rsq = MathExtra::dot3(r12,r12);
jtype = type[j];
// compute if less than cutoff
if (rsq < cutsq[itype][jtype]) {
switch (form[itype][jtype]) {
case SPHERE_SPHERE:
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
forcelj *= -r2inv;
if (eflag) one_eng =
r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
offset[itype][jtype];
fforce[0] = r12[0]*forcelj;
fforce[1] = r12[1]*forcelj;
fforce[2] = r12[2]*forcelj;
ttor[0] = ttor[1] = ttor[2] = 0.0;
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
case SPHERE_ELLIPSE:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
ttor[0] = ttor[1] = ttor[2] = 0.0;
break;
case ELLIPSE_SPHERE:
one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
default:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
fforce,ttor,rtor);
break;
}
fforce[0] *= factor_lj;
fforce[1] *= factor_lj;
fforce[2] *= factor_lj;
ttor[0] *= factor_lj;
ttor[1] *= factor_lj;
ttor[2] *= factor_lj;
f[i][0] += fforce[0];
f[i][1] += fforce[1];
f[i][2] += fforce[2];
tor[i][0] += ttor[0];
tor[i][1] += ttor[1];
tor[i][2] += ttor[2];
if (eflag) evdwl = factor_lj*one_eng;
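// j < start means the partner was already handled on the GPU, so only atom
// i's share is tallied here; otherwise apply the reaction force/torque to a
// local j and tally the pair for both atoms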
if (j<start) {
if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],
fforce[2],-r12[0],-r12[1],-r12[2]);
} else {
if (j < nlocal) {
rtor[0] *= factor_lj;
rtor[1] *= factor_lj;
rtor[2] *= factor_lj;
f[j][0] -= fforce[0];
f[j][1] -= fforce[1];
f[j][2] -= fforce[2];
tor[j][0] += rtor[0];
tor[j][1] += rtor[1];
tor[j][2] += rtor[2];
}
if (evflag) ev_tally_xyz(i,j,nlocal,0,
evdwl,0.0,fforce[0],fforce[1],fforce[2],
-r12[0],-r12[1],-r12[2]);
}
}
}
}
}


@@ -29,21 +29,19 @@ class PairGayBerneGPU : public PairGayBerne {
public:
PairGayBerneGPU(LAMMPS *lmp);
~PairGayBerneGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}


@@ -0,0 +1,318 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj96_cut_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen);
void lj96_gpu_clear();
int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void lj96_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double lj96_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJ96CutGPU::PairLJ96CutGPU(LAMMPS *lmp) : PairLJ96Cut(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJ96CutGPU::~PairLJ96CutGPU()
{
lj96_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
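// gpu_mode selects where neighboring is done: GPU_NEIGH builds the neighbor
// list on the device from raw positions/tags, GPU_PAIR ships the host-built
// full neighbor list to the device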
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = lj96_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
lj96_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
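// atoms host_start..inum-1 are left to the host; time the CPU portion
// (cpu_time is presumably picked up by fix gpu for reporting/balancing)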
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJ96CutGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
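// 300 is the per-atom neighbor allocation (max_nbors) handed to the GPU
// library; gpu_mode is set by the library (GPU_PAIR or GPU_NEIGH)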
bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ96 pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJ96CutGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + lj96_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
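// this path uses the regular LAMMPS full neighbor list requested in
// init_style; list entries before index start were already computed on the GPU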
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
r3inv = sqrt(r6inv);
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
r3inv = sqrt(r6inv);
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}


@@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj96/cut/gpu,PairLJ96CutGPU)
#else
#ifndef LMP_PAIR_LJ_96_GPU_H
#define LMP_PAIR_LJ_96_GPU_H
#include "pair_lj96_cut.h"
namespace LAMMPS_NS {
class PairLJ96CutGPU : public PairLJ96Cut {
public:
PairLJ96CutGPU(LAMMPS *lmp);
~PairLJ96CutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif


@@ -0,0 +1,364 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_coul_cut_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e);
void ljc_gpu_clear();
int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void ljc_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double ljc_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutCoulCutGPU::PairLJCutCoulCutGPU(LAMMPS *lmp) : PairLJCutCoulCut(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJCutCoulCutGPU::~PairLJCutCoulCutGPU()
{
ljc_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljc_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
ljc_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::init_style()
{
if (!atom->q_flag)
error->all("Pair style lj/cut/coul/cut requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
force->special_coul, force->qqrd2e);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJCutCoulCutGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + ljc_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
int newton_pair = force->newton_pair;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
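// LJ and Coulomb terms use their own inner cutoffs within the overall pair
// cutoff, so either contribution may be zero for this pair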
if (rsq < cut_coulsq[itype][jtype])
forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq[itype][jtype])
ecoul = factor_coul * qqrd2e * qtmp*q[j]*sqrt(r2inv);
else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq[itype][jtype])
forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq[itype][jtype])
ecoul = factor_coul * qqrd2e * qtmp*q[j]*sqrt(r2inv);
else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}


@@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj/cut/coul/cut/gpu,PairLJCutCoulCutGPU)
#else
#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_GPU_H
#define LMP_PAIR_LJ_CUT_COUL_CUT_GPU_H
#include "pair_lj_cut_coul_cut.h"
namespace LAMMPS_NS {
class PairLJCutCoulCutGPU : public PairLJCutCoulCut {
public:
PairLJCutCoulCutGPU(LAMMPS *lmp);
~PairLJCutCoulCutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif


@@ -0,0 +1,452 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_coul_long_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#include "kspace.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
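// EWALD_P and A1-A5 are the Abramowitz & Stegun coefficients of the erfc()
// approximation used for the real-space Ewald term in cpu_compute()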
// External functions from cuda library for atom decomposition
bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald);
void ljcl_gpu_clear();
int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void ljcl_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double ljcl_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutCoulLongGPU::PairLJCutCoulLongGPU(LAMMPS *lmp) : PairLJCutCoulLong(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJCutCoulLongGPU::~PairLJCutCoulLongGPU()
{
ljcl_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljcl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
ljcl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::init_style()
{
cut_respa = NULL;
if (!atom->q_flag)
error->all("Pair style lj/cut/coul/long requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
cut_coulsq = cut_coul * cut_coul;
// insure use of KSpace long-range solver, set g_ewald
if (force->kspace == NULL)
error->all("Pair style is incompatible with KSpace style");
g_ewald = force->kspace->g_ewald;
// setup force tables
if (ncoultablebits) init_tables();
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
force->special_coul, force->qqrd2e, g_ewald);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJCutCoulLongGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + ljcl_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
int *ilist,*jlist,*numneigh,**firstneigh;
double rsq;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq) {
r = sqrt(rsq);
grij = g_ewald * r;
expm2 = exp(-grij*grij);
t = 1.0 / (1.0 + EWALD_P*grij);
erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
} else {
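// tabulated Coulomb path: reinterpret the bits of rsq to index the
// precomputed force/energy tables built by init_tables()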
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (factor_coul < 1.0) {
table = ctable[itable] + fraction*dctable[itable];
prefactor = qtmp*q[j] * table;
forcecoul -= (1.0-factor_coul)*prefactor;
}
}
} else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq)
ecoul = prefactor*erfc;
else {
table = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table;
}
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
} else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
double rsq;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq) {
r = sqrt(rsq);
grij = g_ewald * r;
expm2 = exp(-grij*grij);
t = 1.0 / (1.0 + EWALD_P*grij);
erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (factor_coul < 1.0) {
table = ctable[itable] + fraction*dctable[itable];
prefactor = qtmp*q[j] * table;
forcecoul -= (1.0-factor_coul)*prefactor;
}
}
} else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq)
ecoul = prefactor*erfc;
else {
table = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table;
}
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
} else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}


@@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj/cut/coul/long/gpu,PairLJCutCoulLongGPU)
#else
#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_GPU_H
#define LMP_PAIR_LJ_CUT_COUL_LONG_GPU_H
#include "pair_lj_cut_coul_long.h"
namespace LAMMPS_NS {
class PairLJCutCoulLongGPU : public PairLJCutCoulLong {
public:
PairLJCutCoulLongGPU(LAMMPS *lmp);
~PairLJCutCoulLongGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif


@@ -2,28 +2,24 @@
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
@@ -32,79 +28,45 @@
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen);
void ljl_gpu_clear();
int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void ljl_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double ljl_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
@@ -113,17 +75,7 @@
PairLJCutGPU::~PairLJCutGPU()
{
ljl_gpu_clear();
}
/* ---------------------------------------------------------------------- */
@@ -132,45 +84,37 @@
void PairLJCutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
ljl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
@@ -179,74 +123,45 @@
void PairLJCutGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
@@ -254,6 +169,149 @@
double PairLJCutGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + ljl_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}


@@ -17,8 +17,8 @@ PairStyle(lj/cut/gpu,PairLJCutGPU)
#else
#ifndef LMP_PAIR_LJ_LIGHT_GPU_H
#define LMP_PAIR_LJ_LIGHT_GPU_H
#include "pair_lj_cut.h"
@@ -28,20 +28,21 @@ class PairLJCutGPU : public PairLJCut {
public:
PairLJCutGPU(LAMMPS *lmp);
~PairLJCutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif