From e2707ca96f37c10fc982ea99fdde9db68b7c0784 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Thu, 28 Apr 2011 16:16:54 +0000 Subject: [PATCH 01/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6026 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/fix_rigid.cpp | 35 +++++++++++------------------------ src/math_extra.cpp | 6 ++---- src/math_extra.h | 12 ++++++++++++ 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/fix_rigid.cpp b/src/fix_rigid.cpp index 1099f09d7a..4d0686409d 100644 --- a/src/fix_rigid.cpp +++ b/src/fix_rigid.cpp @@ -648,8 +648,7 @@ void FixRigid::init() sum[ibody][0] += 0.4 * massone * radius[i]*radius[i]; sum[ibody][1] += 0.4 * massone * radius[i]*radius[i]; sum[ibody][2] += 0.4 * massone * radius[i]*radius[i]; - } - if (eflags[i] & INERTIA_ELLIPSOID) { + } else if (eflags[i] & INERTIA_ELLIPSOID) { shape = ebonus[ellipsoid[i]].shape; quatatom = ebonus[ellipsoid[i]].quat; MathExtra::inertia_ellipsoid(shape,quatatom,massone,ivec); @@ -665,11 +664,12 @@ void FixRigid::init() MPI_Allreduce(sum[0],all[0],6*nbody,MPI_DOUBLE,MPI_SUM,world); + // diagonalize inertia tensor for each body via Jacobi rotations // inertia = 3 eigenvalues = principal moments of inertia - // ex_space,ey_space,ez_space = 3 eigenvectors = principal axes of rigid body - + // evectors and exzy_space = 3 evectors = principal axes of rigid body + int ierror; - double ez0,ez1,ez2; + double cross[3]; double tensor[3][3],evectors[3][3]; for (ibody = 0; ibody < nbody; ibody++) { @@ -686,11 +686,9 @@ void FixRigid::init() ex_space[ibody][0] = evectors[0][0]; ex_space[ibody][1] = evectors[1][0]; ex_space[ibody][2] = evectors[2][0]; - ey_space[ibody][0] = evectors[0][1]; ey_space[ibody][1] = evectors[1][1]; ey_space[ibody][2] = evectors[2][1]; - ez_space[ibody][0] = evectors[0][2]; ez_space[ibody][1] = evectors[1][2]; ez_space[ibody][2] = evectors[2][2]; @@ -706,21 +704,11 @@ void FixRigid::init() if (inertia[ibody][2] < EPSILON*max) inertia[ibody][2] = 0.0; // enforce 
3 evectors as a right-handed coordinate system - // flip 3rd evector if needed - - ez0 = ex_space[ibody][1]*ey_space[ibody][2] - - ex_space[ibody][2]*ey_space[ibody][1]; - ez1 = ex_space[ibody][2]*ey_space[ibody][0] - - ex_space[ibody][0]*ey_space[ibody][2]; - ez2 = ex_space[ibody][0]*ey_space[ibody][1] - - ex_space[ibody][1]*ey_space[ibody][0]; - - if (ez0*ez_space[ibody][0] + ez1*ez_space[ibody][1] + - ez2*ez_space[ibody][2] < 0.0) { - ez_space[ibody][0] = -ez_space[ibody][0]; - ez_space[ibody][1] = -ez_space[ibody][1]; - ez_space[ibody][2] = -ez_space[ibody][2]; - } + // flip 3rd vector if needed + + MathExtra::cross3(ex_space[ibody],ey_space[ibody],cross); + if (MathExtra::dot3(cross,ez_space[ibody]) < 0.0) + MathExtra::negate3(ez_space[ibody]); // create initial quaternion @@ -823,8 +811,7 @@ void FixRigid::init() sum[ibody][0] += 0.4 * massone * radius[i]*radius[i]; sum[ibody][1] += 0.4 * massone * radius[i]*radius[i]; sum[ibody][2] += 0.4 * massone * radius[i]*radius[i]; - } - if (eflags[i] & INERTIA_ELLIPSOID) { + } else if (eflags[i] & INERTIA_ELLIPSOID) { shape = ebonus[ellipsoid[i]].shape; MathExtra::inertia_ellipsoid(shape,qorient[i],massone,ivec); sum[ibody][0] += ivec[0]; diff --git a/src/math_extra.cpp b/src/math_extra.cpp index c4318e8bbe..5160262aff 100644 --- a/src/math_extra.cpp +++ b/src/math_extra.cpp @@ -487,7 +487,7 @@ void inertia_line(double length, double theta, double mass, double *inertia) /* ---------------------------------------------------------------------- compute space-frame inertia tensor of a triangle v0,v1,v2 = 3 vertices of triangle - from http://en.wikipedia.org/wiki/Inertia_tensor_of_triangle: + from http://en.wikipedia.org/wiki/Inertia_tensor_of_triangle inertia tensor = a/24 (v0^2 + v1^2 + v2^2 + (v0+v1+v2)^2) I - a Vt S V a = 2*area of tri = |(v1-v0) x (v2-v0)| I = 3x3 identity matrix @@ -523,9 +523,7 @@ void inertia_triangle(double *v0, double *v1, double *v2, sub3(v2,v0,v2mv0); cross3(v1mv0,v2mv0,normal); double a = 
len3(normal); - double inv24 = 1.0/24.0; - - // NOTE: use mass + double inv24 = mass/24.0; inertia[0] = inv24*a*(sum-vtsv[0][0]); inertia[1] = inv24*a*(sum-vtsv[1][1]); diff --git a/src/math_extra.h b/src/math_extra.h index 3ca98f8f12..44af2e9a8a 100755 --- a/src/math_extra.h +++ b/src/math_extra.h @@ -30,6 +30,7 @@ namespace MathExtra { inline void norm3(double *v); inline void normalize3(const double *v, double *ans); inline void snormalize3(const double, const double *v, double *ans); + inline void negate3(double *v); inline void add3(const double *v1, const double *v2, double *ans); inline void sub3(const double *v1, const double *v2, double *ans); inline double len3(const double *v); @@ -156,6 +157,17 @@ void MathExtra::snormalize3(const double length, const double *v, double *ans) ans[2] = v[2]*scale; } +/* ---------------------------------------------------------------------- + negate vector v +------------------------------------------------------------------------- */ + +void MathExtra::negate3(double *v) +{ + v[0] = -v[0]; + v[1] = -v[1]; + v[2] = -v[2]; +} + /* ---------------------------------------------------------------------- ans = v1 + v2 ------------------------------------------------------------------------- */ From bce11d1402ab028a34a9b63e10883c56e7be8da1 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Thu, 28 Apr 2011 18:52:25 +0000 Subject: [PATCH 02/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6029 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/compute_cluster_atom.cpp | 2 +- src/fix_langevin.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/compute_cluster_atom.cpp b/src/compute_cluster_atom.cpp index 38a63d078c..0f3f3dd709 100644 --- a/src/compute_cluster_atom.cpp +++ b/src/compute_cluster_atom.cpp @@ -104,7 +104,7 @@ void ComputeClusterAtom::compute_peratom() // grow clusterID array if necessary - if (atom->nlocal > nmax) { + if (atom->nlocal+atom->nghost > nmax) { memory->destroy(clusterID); nmax = 
atom->nmax; memory->create(clusterID,nmax,"cluster/atom:clusterID"); diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp index 47699bfac7..37da93f90d 100644 --- a/src/fix_langevin.cpp +++ b/src/fix_langevin.cpp @@ -338,7 +338,6 @@ void FixLangevin::post_force_no_tally() } } } - } /* ---------------------------------------------------------------------- */ From 995a92b9f3efa35a4a2574069583011d7e2772ca Mon Sep 17 00:00:00 2001 From: sjplimp Date: Thu, 28 Apr 2011 18:52:32 +0000 Subject: [PATCH 03/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6030 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 6970bbeb4a..66a97ccc65 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "20 Apr 2011" +#define LAMMPS_VERSION "27 Apr 2011" From 199c005d935e49b772befeedfc2150f1dbfe7f82 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 15:52:26 +0000 Subject: [PATCH 04/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6033 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/ASPHERE/compute_temp_asphere.cpp | 227 ++++++++++++++++++--------- src/ASPHERE/compute_temp_asphere.h | 2 +- src/ASPHERE/fix_nve_asphere.cpp | 8 +- src/USER-EFF/fix_langevin_eff.cpp | 16 +- src/compute_temp_sphere.cpp | 143 +++++++++++------ src/compute_temp_sphere.h | 2 +- src/fix_langevin.cpp | 176 +++++++++++++++++++-- src/fix_langevin.h | 6 +- src/fix_rigid.cpp | 166 ++++++++++++++++++-- src/fix_rigid.h | 8 + 10 files changed, 601 insertions(+), 153 deletions(-) diff --git a/src/ASPHERE/compute_temp_asphere.cpp b/src/ASPHERE/compute_temp_asphere.cpp index e4e1177a7e..f2d34ac72a 100755 --- a/src/ASPHERE/compute_temp_asphere.cpp +++ b/src/ASPHERE/compute_temp_asphere.cpp @@ -32,13 +32,16 @@ using namespace LAMMPS_NS; +enum{ROTATE,ALL}; + +#define INERTIA 0.2 // moment of inertia for ellipsoid + /* 
---------------------------------------------------------------------- */ ComputeTempAsphere::ComputeTempAsphere(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg) { - if (narg != 3 && narg != 4) - error->all("Illegal compute temp/asphere command"); + if (narg < 3) error->all("Illegal compute temp/asphere command"); scalar_flag = vector_flag = 1; size_vector = 6; @@ -48,11 +51,24 @@ ComputeTempAsphere::ComputeTempAsphere(LAMMPS *lmp, int narg, char **arg) : tempbias = 0; id_bias = NULL; - if (narg == 4) { - tempbias = 1; - int n = strlen(arg[3]) + 1; - id_bias = new char[n]; - strcpy(id_bias,arg[3]); + mode = ALL; + + int iarg = 3; + while (iarg < narg) { + if (strcmp(arg[iarg],"bias") == 0) { + if (iarg+2 > narg) error->all("Illegal compute temp/asphere command"); + tempbias = 1; + int n = strlen(arg[iarg+1]) + 1; + id_bias = new char[n]; + strcpy(id_bias,arg[iarg+1]); + iarg += 2; + } else if (strcmp(arg[iarg],"dof") == 0) { + if (iarg+2 > narg) error->all("Illegal compute temp/asphere command"); + if (strcmp(arg[iarg+1],"rotate") == 0) mode = ROTATE; + else if (strcmp(arg[iarg+1],"all") == 0) mode = ALL; + else error->all("Illegal compute temp/asphere command"); + iarg += 2; + } else error->all("Illegal compute temp/asphere command"); } vector = new double[6]; @@ -76,8 +92,7 @@ ComputeTempAsphere::~ComputeTempAsphere() void ComputeTempAsphere::init() { - // check that all particles are finite-size - // no point particles allowed, spherical is OK + // check that all particles are finite-size, no point particles allowed int *ellipsoid = atom->ellipsoid; int *mask = atom->mask; @@ -114,18 +129,26 @@ void ComputeTempAsphere::init() void ComputeTempAsphere::dof_compute() { // 6 dof for 3d, 3 dof for 2d + // which dof are included also depends on mode // assume full rotation of extended particles // user should correct this via compute_modify if needed double natoms = group->count(igroup); - int nper = 6; - if (domain->dimension == 2) nper = 3; + int nper; + 
if (domain->dimension == 3) { + if (mode == ALL) nper = 6; + else nper = 3; + } else { + if (mode == ALL) nper = 3; + else nper = 1; + } dof = nper*natoms; // additional adjustments to dof - if (tempbias == 1) dof -= tbias->dof_remove(-1) * natoms; - else if (tempbias == 2) { + if (tempbias == 1) { + if (mode == ALL) dof -= tbias->dof_remove(-1) * natoms; + } else if (tempbias == 2) { int *mask = atom->mask; int nlocal = atom->nlocal; int count = 0; @@ -154,46 +177,73 @@ double ComputeTempAsphere::compute_scalar() } AtomVecEllipsoid::Bonus *bonus = avec->bonus; - int *ellipsoid = atom->ellipsoid; double **v = atom->v; double **angmom = atom->angmom; double *rmass = atom->rmass; + int *ellipsoid = atom->ellipsoid; int *mask = atom->mask; int nlocal = atom->nlocal; double *shape,*quat; double wbody[3],inertia[3]; double rot[3][3]; - double t = 0.0; - // sum translationals and rotational energy for each particle + // sum translational and rotational energy for each particle // no point particles since divide by inertia - for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { + double t = 0.0; - shape = bonus[ellipsoid[i]].shape; - quat = bonus[ellipsoid[i]].quat; + if (mode == ALL) { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i]; - t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i]; + // principal moments of inertia - // principal moments of inertia + shape = bonus[ellipsoid[i]].shape; + quat = bonus[ellipsoid[i]].quat; - inertia[0] = rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]) / 5.0; - inertia[1] = rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]) / 5.0; - inertia[2] = rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]) / 5.0; + inertia[0] = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); - // wbody = 
angular velocity in body frame + // wbody = angular velocity in body frame - MathExtra::quat_to_mat(quat,rot); - MathExtra::transpose_matvec(rot,angmom[i],wbody); - wbody[0] /= inertia[0]; - wbody[1] /= inertia[1]; - wbody[2] /= inertia[2]; + MathExtra::quat_to_mat(quat,rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + wbody[0] /= inertia[0]; + wbody[1] /= inertia[1]; + wbody[2] /= inertia[2]; + + t += inertia[0]*wbody[0]*wbody[0] + + inertia[1]*wbody[1]*wbody[1] + inertia[2]*wbody[2]*wbody[2]; + } - t += inertia[0]*wbody[0]*wbody[0] + - inertia[1]*wbody[1]*wbody[1] + inertia[2]*wbody[2]*wbody[2]; - } + } else { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + + // principal moments of inertia + + shape = bonus[ellipsoid[i]].shape; + quat = bonus[ellipsoid[i]].quat; + + inertia[0] = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + + // wbody = angular velocity in body frame + + MathExtra::quat_to_mat(quat,rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + wbody[0] /= inertia[0]; + wbody[1] /= inertia[1]; + wbody[2] /= inertia[2]; + + t += inertia[0]*wbody[0]*wbody[0] + + inertia[1]*wbody[1]*wbody[1] + inertia[2]*wbody[2]*wbody[2]; + } + } if (tempbias) tbias->restore_bias_all(); @@ -217,58 +267,93 @@ void ComputeTempAsphere::compute_vector() } AtomVecEllipsoid::Bonus *bonus = avec->bonus; - int *ellipsoid = atom->ellipsoid; double **v = atom->v; double **angmom = atom->angmom; double *rmass = atom->rmass; + int *ellipsoid = atom->ellipsoid; int *mask = atom->mask; int nlocal = atom->nlocal; double *shape,*quat; - double wbody[3],inertia[3]; + double wbody[3],inertia[3],t[6]; double rot[3][3]; - double massone,t[6]; + double massone; + + // sum translational and rotational energy for each particle + // no point particles since divide by inertia + for (i = 0; i < 6; i++) t[i] = 
0.0; - for (i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { + if (mode == ALL) { + for (i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + massone = rmass[i]; + t[0] += massone * v[i][0]*v[i][0]; + t[1] += massone * v[i][1]*v[i][1]; + t[2] += massone * v[i][2]*v[i][2]; + t[3] += massone * v[i][0]*v[i][1]; + t[4] += massone * v[i][0]*v[i][2]; + t[5] += massone * v[i][1]*v[i][2]; + + // principal moments of inertia - shape = bonus[ellipsoid[i]].shape; - quat = bonus[ellipsoid[i]].quat; + shape = bonus[ellipsoid[i]].shape; + quat = bonus[ellipsoid[i]].quat; - // translational kinetic energy + inertia[0] = INERTIA*massone * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*massone * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*massone * (shape[0]*shape[0]+shape[1]*shape[1]); + + // wbody = angular velocity in body frame + + MathExtra::quat_to_mat(quat,rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + wbody[0] /= inertia[0]; + wbody[1] /= inertia[1]; + wbody[2] /= inertia[2]; + + // rotational kinetic energy + + t[0] += inertia[0]*wbody[0]*wbody[0]; + t[1] += inertia[1]*wbody[1]*wbody[1]; + t[2] += inertia[2]*wbody[2]*wbody[2]; + t[3] += inertia[0]*wbody[0]*wbody[1]; + t[4] += inertia[1]*wbody[0]*wbody[2]; + t[5] += inertia[2]*wbody[1]*wbody[2]; + } - massone = rmass[i]; - t[0] += massone * v[i][0]*v[i][0]; - t[1] += massone * v[i][1]*v[i][1]; - t[2] += massone * v[i][2]*v[i][2]; - t[3] += massone * v[i][0]*v[i][1]; - t[4] += massone * v[i][0]*v[i][2]; - t[5] += massone * v[i][1]*v[i][2]; + } else { + for (i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + + // principal moments of inertia - // principal moments of inertia + shape = bonus[ellipsoid[i]].shape; + quat = bonus[ellipsoid[i]].quat; + massone = rmass[i]; - inertia[0] = rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]) / 5.0; - inertia[1] = rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]) / 5.0; - inertia[2] = rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]) / 
5.0; - - // wbody = angular velocity in body frame - - MathExtra::quat_to_mat(quat,rot); - MathExtra::transpose_matvec(rot,angmom[i],wbody); - wbody[0] /= inertia[0]; - wbody[1] /= inertia[1]; - wbody[2] /= inertia[2]; - - // rotational kinetic energy - - t[0] += inertia[0]*wbody[0]*wbody[0]; - t[1] += inertia[1]*wbody[1]*wbody[1]; - t[2] += inertia[2]*wbody[2]*wbody[2]; - t[3] += inertia[0]*wbody[0]*wbody[1]; - t[4] += inertia[1]*wbody[0]*wbody[2]; - t[5] += inertia[2]*wbody[1]*wbody[2]; - } + inertia[0] = INERTIA*massone * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*massone * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*massone * (shape[0]*shape[0]+shape[1]*shape[1]); + + // wbody = angular velocity in body frame + + MathExtra::quat_to_mat(quat,rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + wbody[0] /= inertia[0]; + wbody[1] /= inertia[1]; + wbody[2] /= inertia[2]; + + // rotational kinetic energy + + t[0] += inertia[0]*wbody[0]*wbody[0]; + t[1] += inertia[1]*wbody[1]*wbody[1]; + t[2] += inertia[2]*wbody[2]*wbody[2]; + t[3] += inertia[0]*wbody[0]*wbody[1]; + t[4] += inertia[1]*wbody[0]*wbody[2]; + t[5] += inertia[2]*wbody[1]*wbody[2]; + } + } if (tempbias) tbias->restore_bias_all(); diff --git a/src/ASPHERE/compute_temp_asphere.h b/src/ASPHERE/compute_temp_asphere.h index 19e29ebf1b..dde67a1bd5 100755 --- a/src/ASPHERE/compute_temp_asphere.h +++ b/src/ASPHERE/compute_temp_asphere.h @@ -36,7 +36,7 @@ class ComputeTempAsphere : public Compute { void restore_bias(int, double *); private: - int fix_dof; + int fix_dof,mode; double tfactor; char *id_bias; class Compute *tbias; // ptr to additional bias compute diff --git a/src/ASPHERE/fix_nve_asphere.cpp b/src/ASPHERE/fix_nve_asphere.cpp index 4f8c94e208..9e4155581f 100755 --- a/src/ASPHERE/fix_nve_asphere.cpp +++ b/src/ASPHERE/fix_nve_asphere.cpp @@ -29,6 +29,8 @@ using namespace LAMMPS_NS; +#define INERTIA 0.2 // moment of inertia for ellipsoid + /* 
---------------------------------------------------------------------- */ FixNVEAsphere::FixNVEAsphere(LAMMPS *lmp, int narg, char **arg) : @@ -103,9 +105,9 @@ void FixNVEAsphere::initial_integrate(int vflag) shape = bonus[ellipsoid[i]].shape; quat = bonus[ellipsoid[i]].quat; - inertia[0] = rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]) / 5.0; - inertia[1] = rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]) / 5.0; - inertia[2] = rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]) / 5.0; + inertia[0] = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); // compute omega at 1/2 step from angmom at 1/2 step and current q // update quaternion a full step via Richardson iteration diff --git a/src/USER-EFF/fix_langevin_eff.cpp b/src/USER-EFF/fix_langevin_eff.cpp index 2758ebea07..7328fdb23b 100644 --- a/src/USER-EFF/fix_langevin_eff.cpp +++ b/src/USER-EFF/fix_langevin_eff.cpp @@ -88,7 +88,9 @@ void FixLangevinEff::post_force_no_tally() f[i][0] += gamma1*v[i][0] + gamma2*(random->uniform()-0.5); f[i][1] += gamma1*v[i][1] + gamma2*(random->uniform()-0.5); f[i][2] += gamma1*v[i][2] + gamma2*(random->uniform()-0.5); - if (abs(spin[i])==1) erforce[i] += 0.75*gamma1*ervel[i] + 0.866025404*gamma2*(random->uniform()-0.5); + if (abs(spin[i])==1) + erforce[i] += 0.75*gamma1*ervel[i] + + 0.866025404*gamma2*(random->uniform()-0.5); } } } else if (which == BIAS) { @@ -105,7 +107,8 @@ void FixLangevinEff::post_force_no_tally() if (v[i][2] != 0.0) f[i][2] += gamma1*v[i][2] + gamma2*(random->uniform()-0.5); if (abs(spin[i])==1 && ervel[i] != 0.0) - erforce[i] += 0.75*gamma1*ervel[i] + 0.866025404*gamma2*(random->uniform()-0.5); + erforce[i] += 0.75*gamma1*ervel[i] + + 0.866025404*gamma2*(random->uniform()-0.5); temperature->restore_bias(i,v[i]); } } @@ -158,7 +161,8 @@ void FixLangevinEff::post_force_tally() flangevin[i][0] = 
gamma1*v[i][0] + gamma2*(random->uniform()-0.5); flangevin[i][1] = gamma1*v[i][1] + gamma2*(random->uniform()-0.5); flangevin[i][2] = gamma1*v[i][2] + gamma2*(random->uniform()-0.5); - erforcelangevin[i] = 0.75*gamma1*ervel[i]+0.866025404*gamma2*(random->uniform()-0.5); + erforcelangevin[i] = 0.75*gamma1*ervel[i] + + 0.866025404*gamma2*(random->uniform()-0.5); f[i][0] += flangevin[i][0]; f[i][1] += flangevin[i][1]; f[i][2] += flangevin[i][2]; @@ -175,14 +179,16 @@ void FixLangevinEff::post_force_tally() flangevin[i][0] = gamma1*v[i][0] + gamma2*(random->uniform()-0.5); flangevin[i][1] = gamma1*v[i][1] + gamma2*(random->uniform()-0.5); flangevin[i][2] = gamma1*v[i][2] + gamma2*(random->uniform()-0.5); - erforcelangevin[i] = 0.75*gamma1*ervel[i]+0.866025404*gamma2*(random->uniform()-0.5); + erforcelangevin[i] = 0.75*gamma1*ervel[i] + + 0.866025404*gamma2*(random->uniform()-0.5); if (v[i][0] != 0.0) f[i][0] += flangevin[i][0]; else flangevin[i][0] = 0.0; if (v[i][1] != 0.0) f[i][1] += flangevin[i][1]; else flangevin[i][1] = 0.0; if (v[i][2] != 0.0) f[i][2] += flangevin[i][2]; else flangevin[i][2] = 0.0; - if (abs(spin[i])==1 && ervel[i] != 0.0) erforce[i] += erforcelangevin[i]; + if (abs(spin[i])==1 && ervel[i] != 0.0) + erforce[i] += erforcelangevin[i]; temperature->restore_bias(i,v[i]); } } diff --git a/src/compute_temp_sphere.cpp b/src/compute_temp_sphere.cpp index bad55efdb8..93c9ec74aa 100644 --- a/src/compute_temp_sphere.cpp +++ b/src/compute_temp_sphere.cpp @@ -26,6 +26,8 @@ using namespace LAMMPS_NS; +enum{ROTATE,ALL}; + #define INERTIA 0.4 // moment of inertia for sphere /* ---------------------------------------------------------------------- */ @@ -33,8 +35,7 @@ using namespace LAMMPS_NS; ComputeTempSphere::ComputeTempSphere(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg) { - if (narg != 3 && narg != 4) - error->all("Illegal compute temp/sphere command"); + if (narg < 3) error->all("Illegal compute temp/sphere command"); scalar_flag = 
vector_flag = 1; size_vector = 6; @@ -44,11 +45,24 @@ ComputeTempSphere::ComputeTempSphere(LAMMPS *lmp, int narg, char **arg) : tempbias = 0; id_bias = NULL; - if (narg == 4) { - tempbias = 1; - int n = strlen(arg[3]) + 1; - id_bias = new char[n]; - strcpy(id_bias,arg[3]); + mode = ALL; + + int iarg = 3; + while (iarg < narg) { + if (strcmp(arg[iarg],"bias") == 0) { + if (iarg+2 > narg) error->all("Illegal compute temp/sphere command"); + tempbias = 1; + int n = strlen(arg[iarg+1]) + 1; + id_bias = new char[n]; + strcpy(id_bias,arg[iarg+1]); + iarg += 2; + } else if (strcmp(arg[iarg],"dof") == 0) { + if (iarg+2 > narg) error->all("Illegal compute temp/sphere command"); + if (strcmp(arg[iarg+1],"rotate") == 0) mode = ROTATE; + else if (strcmp(arg[iarg+1],"all") == 0) mode = ALL; + else error->all("Illegal compute temp/sphere command"); + iarg += 2; + } else error->all("Illegal compute temp/sphere command"); } vector = new double[6]; @@ -100,27 +114,34 @@ void ComputeTempSphere::dof_compute() // 6 or 3 dof for extended/point particles for 3d // 3 or 2 dof for extended/point particles for 2d + // which dof are included also depends on mode // assume full rotation of extended particles // user should correct this via compute_modify if needed - int dimension = domain->dimension; - double *radius = atom->radius; int *mask = atom->mask; int nlocal = atom->nlocal; count = 0; - if (dimension == 3) { + if (domain->dimension == 3) { for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { - if (radius[i] == 0.0) count += 3; - else count += 6; + if (radius[i] == 0.0) { + if (mode == ALL) count += 3; + } else { + if (mode == ALL) count += 6; + else count += 3; + } } } else { for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { - if (radius[i] == 0.0) count += 2; - else count += 3; + if (radius[i] == 0.0) { + if (mode == ALL) count += 2; + } else { + if (mode == ALL) count += 3; + else count += 1; + } } } @@ -130,28 +151,38 @@ void ComputeTempSphere::dof_compute() // 
additional adjustments to dof if (tempbias == 1) { - double natoms = group->count(igroup); - dof -= tbias->dof_remove(-1) * natoms; + if (mode == ALL) { + double natoms = group->count(igroup); + dof -= tbias->dof_remove(-1) * natoms; + } } else if (tempbias == 2) { int *mask = atom->mask; int nlocal = atom->nlocal; count = 0; - if (dimension == 3) { + if (domain->dimension == 3) { for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { if (tbias->dof_remove(i)) { - if (radius[i] == 0.0) count += 3; - else count += 6; + if (radius[i] == 0.0) { + if (mode == ALL) count += 3; + } else { + if (mode == ALL) count += 6; + else count += 3; + } } } } else { for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { if (tbias->dof_remove(i)) { - if (radius[i] == 0.0) count += 2; - else count += 3; + if (radius[i] == 0.0) { + if (mode == ALL) count += 2; + } else { + if (mode == ALL) count += 3; + else count += 1; + } } } } @@ -187,12 +218,19 @@ double ComputeTempSphere::compute_scalar() double t = 0.0; - for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i]; - t += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + - omega[i][2]*omega[i][2]) * INERTIA*radius[i]*radius[i]*rmass[i]; - } + if (mode == ALL) { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i]; + t += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + + omega[i][2]*omega[i][2]) * INERTIA*rmass[i]*radius[i]*radius[i]; + } + } else { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + t += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + + omega[i][2]*omega[i][2]) * INERTIA*rmass[i]*radius[i]*radius[i]; + } if (tempbias) tbias->restore_bias_all(); @@ -225,25 +263,38 @@ void ComputeTempSphere::compute_vector() double massone,inertiaone,t[6]; for (int i = 0; i < 6; i++) t[i] = 0.0; - for (int i = 0; i < nlocal; i++) - if (mask[i] & 
groupbit) { - massone = rmass[i]; - t[0] += massone * v[i][0]*v[i][0]; - t[1] += massone * v[i][1]*v[i][1]; - t[2] += massone * v[i][2]*v[i][2]; - t[3] += massone * v[i][0]*v[i][1]; - t[4] += massone * v[i][0]*v[i][2]; - t[5] += massone * v[i][1]*v[i][2]; - - inertiaone = INERTIA*radius[i]*radius[i]*rmass[i]; - t[0] += inertiaone * omega[i][0]*omega[i][0]; - t[1] += inertiaone * omega[i][1]*omega[i][1]; - t[2] += inertiaone * omega[i][2]*omega[i][2]; - t[3] += inertiaone * omega[i][0]*omega[i][1]; - t[4] += inertiaone * omega[i][0]*omega[i][2]; - t[5] += inertiaone * omega[i][1]*omega[i][2]; - } - + if (mode == ALL) { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + massone = rmass[i]; + t[0] += massone * v[i][0]*v[i][0]; + t[1] += massone * v[i][1]*v[i][1]; + t[2] += massone * v[i][2]*v[i][2]; + t[3] += massone * v[i][0]*v[i][1]; + t[4] += massone * v[i][0]*v[i][2]; + t[5] += massone * v[i][1]*v[i][2]; + + inertiaone = INERTIA*rmass[i]*radius[i]*radius[i]; + t[0] += inertiaone * omega[i][0]*omega[i][0]; + t[1] += inertiaone * omega[i][1]*omega[i][1]; + t[2] += inertiaone * omega[i][2]*omega[i][2]; + t[3] += inertiaone * omega[i][0]*omega[i][1]; + t[4] += inertiaone * omega[i][0]*omega[i][2]; + t[5] += inertiaone * omega[i][1]*omega[i][2]; + } + } else { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + inertiaone = INERTIA*rmass[i]*radius[i]*radius[i]; + t[0] += inertiaone * omega[i][0]*omega[i][0]; + t[1] += inertiaone * omega[i][1]*omega[i][1]; + t[2] += inertiaone * omega[i][2]*omega[i][2]; + t[3] += inertiaone * omega[i][0]*omega[i][1]; + t[4] += inertiaone * omega[i][0]*omega[i][2]; + t[5] += inertiaone * omega[i][1]*omega[i][2]; + } + } + if (tempbias) tbias->restore_bias_all(); MPI_Allreduce(t,vector,6,MPI_DOUBLE,MPI_SUM,world); diff --git a/src/compute_temp_sphere.h b/src/compute_temp_sphere.h index 86285061bd..c0b29dce59 100644 --- a/src/compute_temp_sphere.h +++ b/src/compute_temp_sphere.h @@ -36,7 +36,7 @@ class 
ComputeTempSphere : public Compute { void restore_bias(int, double *); private: - int fix_dof; + int fix_dof,mode; double tfactor; double *inertia; char *id_bias; diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp index 37da93f90d..0d233126ef 100644 --- a/src/fix_langevin.cpp +++ b/src/fix_langevin.cpp @@ -20,7 +20,9 @@ #include "string.h" #include "stdlib.h" #include "fix_langevin.h" +#include "math_extra.h" #include "atom.h" +#include "atom_vec_ellipsoid.h" #include "force.h" #include "update.h" #include "modify.h" @@ -38,6 +40,9 @@ using namespace LAMMPS_NS; enum{NOBIAS,BIAS}; +#define SINERTIA 0.4 // moment of inertia for sphere +#define EINERTIA 0.2 // moment of inertia for ellipsoid + /* ---------------------------------------------------------------------- */ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : @@ -71,6 +76,7 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : // optional args for (int i = 1; i <= atom->ntypes; i++) ratio[i] = 1.0; + oflag = aflag = 0; tally = 0; zeroflag = 0; @@ -96,9 +102,29 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+1],"yes") == 0) zeroflag = 1; else error->all("Illegal fix langevin command"); iarg += 2; + } else if (strcmp(arg[iarg],"omega") == 0) { + if (iarg+2 > narg) error->all("Illegal fix langevin command"); + if (strcmp(arg[iarg+1],"no") == 0) oflag = 0; + else if (strcmp(arg[iarg+1],"yes") == 0) oflag = 1; + else error->all("Illegal fix langevin command"); + iarg += 2; + } else if (strcmp(arg[iarg],"angmom") == 0) { + if (iarg+2 > narg) error->all("Illegal fix langevin command"); + if (strcmp(arg[iarg+1],"no") == 0) aflag = 0; + else if (strcmp(arg[iarg+1],"yes") == 0) aflag = 1; + else error->all("Illegal fix langevin command"); + iarg += 2; } else error->all("Illegal fix langevin command"); } + // error check + + if (aflag) { + avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid"); + if (!avec) + error->all("Fix langevin angmom 
requires atom style ellipsoid"); + } + // set temperature = NULL, user can override via fix_modify if wants bias id_temp = NULL; @@ -140,6 +166,35 @@ int FixLangevin::setmask() void FixLangevin::init() { + if (oflag && !atom->sphere_flag) + error->all("Fix langevin omega require atom style sphere"); + if (aflag && !atom->ellipsoid_flag) + error->all("Fix langevin angmom require atom style ellipsoid"); + + // if oflag or aflag set, check that all group particles are finite-size + + if (oflag) { + double *radius = atom->radius; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + if (radius[i] == 0.0) + error->one("Fix langevin omega requires extended particles"); + } + + if (aflag) { + int *ellipsoid = atom->ellipsoid; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + if (ellipsoid[i] < 0) + error->one("Fix langevin angmom requires extended particles"); + } + // set force prefactors if (!atom->rmass) { @@ -219,6 +274,11 @@ void FixLangevin::post_force_no_tally() double fran[3],fsum[3],fsumall[3]; fsum[0] = fsum[1] = fsum[2] = 0.0; bigint count; + + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; if (zeroflag) { count = group->count(igroup); @@ -227,11 +287,6 @@ void FixLangevin::post_force_no_tally() } if (rmass) { - double boltz = force->boltz; - double dt = update->dt; - double mvv2e = force->mvv2e; - double ftm2v = force->ftm2v; - if (which == NOBIAS) { for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -280,7 +335,6 @@ void FixLangevin::post_force_no_tally() } else { if (which == NOBIAS) { - for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { gamma1 = gfactor1[type[i]]; @@ -295,7 +349,6 @@ void FixLangevin::post_force_no_tally() fsum[1] += fran[1]; fsum[2] += fran[2]; } - } } else if (which == BIAS) { @@ -338,6 +391,11 @@ void 
FixLangevin::post_force_no_tally() } } } + + // thermostat omega and angmom + + if (oflag) omega_thermostat(tsqrt); + if (aflag) angmom_thermostat(tsqrt); } /* ---------------------------------------------------------------------- */ @@ -373,12 +431,12 @@ void FixLangevin::post_force_tally() // test v = 0 since some computes mask non-participating atoms via v = 0 // and added force has extra term not multiplied by v = 0 - if (rmass) { - double boltz = force->boltz; - double dt = update->dt; - double mvv2e = force->mvv2e; - double ftm2v = force->ftm2v; + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; + if (rmass) { if (which == NOBIAS) { for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -454,6 +512,100 @@ void FixLangevin::post_force_tally() } } } + + // thermostat omega and angmom + + if (oflag) omega_thermostat(tsqrt); + if (aflag) angmom_thermostat(tsqrt); +} + +/* ---------------------------------------------------------------------- + thermostat rotational dof via omega +------------------------------------------------------------------------- */ + +void FixLangevin::omega_thermostat(double tsqrt) +{ + double gamma1,gamma2; + + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; + + double **torque = atom->torque; + double **omega = atom->omega; + double *radius = atom->radius; + double *rmass = atom->rmass; + int *mask = atom->mask; + int *type = atom->type; + int nlocal = atom->nlocal; + + double tran[3]; + double inertiaone; + + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + inertiaone = SINERTIA*radius[i]*radius[i]*rmass[i]; + gamma1 = -inertiaone / t_period / ftm2v; + gamma2 = sqrt(inertiaone) * sqrt(24.0*boltz/t_period/dt/mvv2e) / ftm2v; + gamma1 *= 1.0/ratio[type[i]]; + gamma2 *= 1.0/sqrt(ratio[type[i]]) * tsqrt; + tran[0] = gamma2*(random->uniform()-0.5); + tran[1] = 
gamma2*(random->uniform()-0.5); + tran[2] = gamma2*(random->uniform()-0.5); + torque[i][0] += gamma1*omega[i][0] + tran[0]; + torque[i][1] += gamma1*omega[i][1] + tran[1]; + torque[i][2] += gamma1*omega[i][2] + tran[2]; + } + } +} + +/* ---------------------------------------------------------------------- + thermostat rotational dof via angmom +------------------------------------------------------------------------- */ + +void FixLangevin::angmom_thermostat(double tsqrt) +{ + double gamma1,gamma2; + + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; + + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + double **torque = atom->torque; + double **angmom = atom->angmom; + double *rmass = atom->rmass; + int *ellipsoid = atom->ellipsoid; + int *mask = atom->mask; + int *type = atom->type; + int nlocal = atom->nlocal; + + double inertia[3],wbody[3],omega[3],tran[3],rot[3][3]; + double *shape,*quat; + + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + shape = bonus[ellipsoid[i]].shape; + inertia[0] = EINERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = EINERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = EINERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + quat = bonus[ellipsoid[i]].quat; + MathExtra::mq_to_omega(angmom[i],quat,inertia,omega); + + gamma1 = -1.0 / t_period / ftm2v; + gamma2 = sqrt(24.0*boltz/t_period/dt/mvv2e) / ftm2v; + gamma1 *= 1.0/ratio[type[i]]; + gamma2 *= 1.0/sqrt(ratio[type[i]]) * tsqrt; + tran[0] = sqrt(inertia[0])*gamma2*(random->uniform()-0.5); + tran[1] = sqrt(inertia[1])*gamma2*(random->uniform()-0.5); + tran[2] = sqrt(inertia[2])*gamma2*(random->uniform()-0.5); + torque[i][0] += inertia[0]*gamma1*omega[0] + tran[0]; + torque[i][1] += inertia[1]*gamma1*omega[1] + tran[1]; + torque[i][2] += inertia[2]*gamma1*omega[2] + tran[2]; + } + } } /* ---------------------------------------------------------------------- 
diff --git a/src/fix_langevin.h b/src/fix_langevin.h index f325befd46..735f6cdcf0 100644 --- a/src/fix_langevin.h +++ b/src/fix_langevin.h @@ -41,11 +41,13 @@ class FixLangevin : public Fix { double memory_usage(); protected: - int which,tally,zeroflag; + int which,tally,zeroflag,oflag,aflag; double t_start,t_stop,t_period; double *gfactor1,*gfactor2,*ratio; double energy,energy_onestep; + class AtomVecEllipsoid *avec; + int nmax; double **flangevin; @@ -57,6 +59,8 @@ class FixLangevin : public Fix { virtual void post_force_no_tally(); virtual void post_force_tally(); + void omega_thermostat(double); + void angmom_thermostat(double); }; } diff --git a/src/fix_rigid.cpp b/src/fix_rigid.cpp index 4d0686409d..6d43b9949e 100644 --- a/src/fix_rigid.cpp +++ b/src/fix_rigid.cpp @@ -25,6 +25,7 @@ #include "modify.h" #include "group.h" #include "comm.h" +#include "random_mars.h" #include "force.h" #include "output.h" #include "memory.h" @@ -45,11 +46,16 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : { int i,ibody; + scalar_flag = 1; + extscalar = 0; time_integrate = 1; rigid_flag = 1; virial_flag = 1; create_attribute = 1; + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + // perform initial allocation of atom-based arrays // register with Atom class @@ -193,12 +199,14 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : memory->create(imagebody,nbody,"rigid:imagebody"); memory->create(fflag,nbody,3,"rigid:fflag"); memory->create(tflag,nbody,3,"rigid:tflag"); + memory->create(langextra,nbody,6,"rigid:langextra"); memory->create(sum,nbody,6,"rigid:sum"); memory->create(all,nbody,6,"rigid:all"); memory->create(remapflag,nbody,4,"rigid:remapflag"); // initialize force/torque flags to default = 1.0 + // for 2d: fz, tx, ty = 0.0 array_flag = 1; size_array_rows = nbody; @@ -209,10 +217,13 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : for (i = 0; i < nbody; i++) { fflag[i][0] = fflag[i][1] = fflag[i][2] = 1.0; tflag[i][0] = tflag[i][1] = 
tflag[i][2] = 1.0; + if (domain->dimension == 2) fflag[i][2] = tflag[i][0] = tflag[i][1] = 0.0; } // parse optional args + int seed; + langflag = 0; tempflag = 0; pressflag = 0; t_chain = 10; @@ -238,6 +249,9 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+4],"on") == 0) zflag = 1.0; else error->all("Illegal fix rigid command"); + if (domain->dimension == 2 && zflag == 1.0) + error->all("Fix rigid z force cannot be on for 2d simulation"); + int count = 0; for (int m = mlo; m <= mhi; m++) { fflag[m-1][0] = xflag; @@ -266,6 +280,9 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+4],"on") == 0) zflag = 1.0; else error->all("Illegal fix rigid command"); + if (domain->dimension == 2 && (xflag == 1.0 || yflag == 1.0)) + error->all("Fix rigid xy torque cannot be on for 2d simulation"); + int count = 0; for (int m = mlo; m <= mhi; m++) { tflag[m-1][0] = xflag; @@ -277,10 +294,24 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : iarg += 5; + } else if (strcmp(arg[iarg],"langevin") == 0) { + if (iarg+5 > narg) error->all("Illegal fix rigid command"); + if (strcmp(style,"rigid") != 0 && strcmp(style,"rigid/nve") != 0) + error->all("Illegal fix rigid command"); + langflag = 1; + t_start = atof(arg[iarg+1]); + t_stop = atof(arg[iarg+2]); + t_period = atof(arg[iarg+3]); + seed = atoi(arg[iarg+4]); + if (t_period <= 0.0) + error->all("Fix rigid langevin period must be > 0.0"); + if (seed <= 0) error->all("Illegal fix rigid command"); + iarg += 5; + } else if (strcmp(arg[iarg],"temp") == 0) { if (iarg+4 > narg) error->all("Illegal fix rigid command"); if (strcmp(style,"rigid/nvt") != 0 && strcmp(style,"rigid/npt") != 0) - error->all("Illegal fix/rigid command"); + error->all("Illegal fix rigid command"); tempflag = 1; t_start = atof(arg[iarg+1]); t_stop = atof(arg[iarg+2]); @@ -290,7 +321,7 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : } else if (strcmp(arg[iarg],"press") == 0) { if 
(iarg+4 > narg) error->all("Illegal fix rigid command"); if (strcmp(style,"rigid/npt") != 0) - error->all("Illegal fix/rigid command"); + error->all("Illegal fix rigid command"); pressflag = 1; p_start = atof(arg[iarg+1]); p_stop = atof(arg[iarg+2]); @@ -300,7 +331,7 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : } else if (strcmp(arg[iarg],"tparam") == 0) { if (iarg+4 > narg) error->all("Illegal fix rigid command"); if (strcmp(style,"rigid/nvt") != 0) - error->all("Illegal fix/rigid command"); + error->all("Illegal fix rigid command"); t_chain = atoi(arg[iarg+1]); t_iter = atoi(arg[iarg+2]); t_order = atoi(arg[iarg+3]); @@ -309,13 +340,18 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : } else if (strcmp(arg[iarg],"pparam") == 0) { if (iarg+2 > narg) error->all("Illegal fix rigid command"); if (strcmp(style,"rigid/npt") != 0) - error->all("Illegal fix/rigid command"); + error->all("Illegal fix rigid command"); p_chain = atoi(arg[iarg+1]); iarg += 2; } else error->all("Illegal fix rigid command"); } + // initialize Marsaglia RNG with processor-unique seed + + if (langflag) random = new RanMars(lmp,seed + me); + else random = NULL; + // initialize vector output quantities in case accessed before run for (i = 0; i < nbody; i++) { @@ -369,7 +405,7 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : int nsum = 0; for (ibody = 0; ibody < nbody; ibody++) nsum += nrigid[ibody]; - if (comm->me == 0) { + if (me == 0) { if (screen) fprintf(screen,"%d rigid bodies with %d atoms\n",nbody,nsum); if (logfile) fprintf(logfile,"%d rigid bodies with %d atoms\n",nbody,nsum); } @@ -383,6 +419,8 @@ FixRigid::~FixRigid() atom->delete_callback(id,0); + delete random; + // delete locally stored arrays memory->destroy(body); @@ -409,6 +447,7 @@ FixRigid::~FixRigid() memory->destroy(imagebody); memory->destroy(fflag); memory->destroy(tflag); + memory->destroy(langextra); memory->destroy(sum); memory->destroy(all); @@ -422,6 +461,7 @@ int FixRigid::setmask() int 
mask = 0; mask |= INITIAL_INTEGRATE; mask |= FINAL_INTEGRATE; + if (langflag) mask |= POST_FORCE; mask |= PRE_NEIGHBOR; mask |= INITIAL_INTEGRATE_RESPA; mask |= FINAL_INTEGRATE_RESPA; @@ -441,7 +481,7 @@ void FixRigid::init() int count = 0; for (int i = 0; i < modify->nfix; i++) if (strcmp(modify->fix[i]->style,"rigid") == 0) count++; - if (count > 1 && comm->me == 0) error->warning("More than one fix rigid"); + if (count > 1 && me == 0) error->warning("More than one fix rigid"); // error if npt,nph fix comes before rigid fix @@ -855,6 +895,15 @@ void FixRigid::init() fabs(all[ibody][5]/norm) > TOLERANCE) error->all("Fix rigid: Bad principal moments"); } + + // temperature scale factor + + double ndof = 0.0; + for (ibody = 0; ibody < nbody; ibody++) { + ndof += fflag[ibody][0] + fflag[ibody][1] + fflag[ibody][2]; + ndof += tflag[ibody][0] + tflag[ibody][1] + tflag[ibody][2]; + } + tfactor = force->mvv2e / (ndof * force->boltz); } /* ---------------------------------------------------------------------- */ @@ -998,6 +1047,13 @@ void FixRigid::setup(int vflag) torque[ibody][2] = all[ibody][5]; } + // zero langextra in case Langevin thermostat not used + // no point to calling post_force() here since langextra + // is only added to fcm/torque in final_integrate() + + for (ibody = 0; ibody < nbody; ibody++) + for (i = 0; i < 6; i++) langextra[ibody][i] = 0.0; + // virial setup before call to set_v if (vflag) v_setup(vflag); @@ -1072,6 +1128,50 @@ void FixRigid::initial_integrate(int vflag) set_xv(); } +/* ---------------------------------------------------------------------- + apply Langevin thermostat to all 6 DOF of rigid bodies + computed by proc 0, broadcast to other procs + unlike fix langevin, this stores extra force in extra arrays, + which are added in when final_integrate() calculates a new fcm/torque +------------------------------------------------------------------------- */ + +void FixRigid::post_force(int vflag) +{ + if (me == 0) { + double gamma1,gamma2; 
+ + double delta = update->ntimestep - update->beginstep; + delta /= update->endstep - update->beginstep; + double t_target = t_start + delta * (t_stop-t_start); + double tsqrt = sqrt(t_target); + + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; + + for (int i = 0; i < nbody; i++) { + gamma1 = -masstotal[i] / t_period / ftm2v; + gamma2 = sqrt(masstotal[i]) * tsqrt * + sqrt(24.0*boltz/t_period/dt/mvv2e) / ftm2v; + langextra[i][0] = gamma1*vcm[i][0] + gamma2*(random->uniform()-0.5); + langextra[i][1] = gamma1*vcm[i][1] + gamma2*(random->uniform()-0.5); + langextra[i][2] = gamma1*vcm[i][2] + gamma2*(random->uniform()-0.5); + + gamma1 = -1.0 / t_period / ftm2v; + gamma2 = tsqrt * sqrt(24.0*boltz/t_period/dt/mvv2e) / ftm2v; + langextra[i][3] = inertia[i][0]*gamma1*omega[i][0] + + sqrt(inertia[i][0])*gamma2*(random->uniform()-0.5); + langextra[i][4] = inertia[i][1]*gamma1*omega[i][1] + + sqrt(inertia[i][1])*gamma2*(random->uniform()-0.5); + langextra[i][5] = inertia[i][2]*gamma1*omega[i][2] + + sqrt(inertia[i][2])*gamma2*(random->uniform()-0.5); + } + } + + MPI_Bcast(&langextra[0][0],6*nbody,MPI_DOUBLE,0,world); +} + /* ---------------------------------------------------------------------- */ void FixRigid::final_integrate() @@ -1150,13 +1250,17 @@ void FixRigid::final_integrate() MPI_Allreduce(sum[0],all[0],6*nbody,MPI_DOUBLE,MPI_SUM,world); + // update vcm and angmom + // include Langevin thermostat forces + // fflag,tflag = 0 for some dimensions in 2d + for (ibody = 0; ibody < nbody; ibody++) { - fcm[ibody][0] = all[ibody][0]; - fcm[ibody][1] = all[ibody][1]; - fcm[ibody][2] = all[ibody][2]; - torque[ibody][0] = all[ibody][3]; - torque[ibody][1] = all[ibody][4]; - torque[ibody][2] = all[ibody][5]; + fcm[ibody][0] = all[ibody][0] + langextra[ibody][0]; + fcm[ibody][1] = all[ibody][1] + langextra[ibody][1]; + fcm[ibody][2] = all[ibody][2] + langextra[ibody][2]; + torque[ibody][0] = all[ibody][3] + 
langextra[ibody][3]; + torque[ibody][1] = all[ibody][4] + langextra[ibody][4]; + torque[ibody][2] = all[ibody][5] + langextra[ibody][5]; // update vcm by 1/2 step @@ -1360,7 +1464,7 @@ int FixRigid::dof(int igroup) if (nall[ibody]+mall[ibody] > 0 && nall[ibody]+mall[ibody] != nrigid[ibody]) flag = 1; } - if (flag && comm->me == 0) + if (flag && me == 0) error->warning("Computing temperature of portions of rigid bodies"); // remove appropriate DOFs for each rigid body wholly in temperature group @@ -1834,6 +1938,42 @@ void FixRigid::reset_dt() dtq = 0.5 * update->dt; } +/* ---------------------------------------------------------------------- + return temperature of collection of rigid bodies + non-active DOF are removed by fflag/tflag and in tfactor +------------------------------------------------------------------------- */ + +double FixRigid::compute_scalar() +{ + double wbody[3],rot[3][3]; + + double t = 0.0; + + for (int i = 0; i < nbody; i++) { + t += masstotal[i] * (fflag[i][0]*vcm[i][0]*vcm[i][0] + + fflag[i][1]*vcm[i][1]*vcm[i][1] + \ + fflag[i][2]*vcm[i][2]*vcm[i][2]); + + // wbody = angular velocity in body frame + + MathExtra::quat_to_mat(quat[i],rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + if (inertia[i][0] == 0.0) wbody[0] = 0.0; + else wbody[0] /= inertia[i][0]; + if (inertia[i][1] == 0.0) wbody[1] = 0.0; + else wbody[1] /= inertia[i][1]; + if (inertia[i][2] == 0.0) wbody[2] = 0.0; + else wbody[2] /= inertia[i][2]; + + t += tflag[i][0]*inertia[i][0]*wbody[0]*wbody[0] + + tflag[i][1]*inertia[i][1]*wbody[1]*wbody[1] + + tflag[i][2]*inertia[i][2]*wbody[2]*wbody[2]; + } + + t *= tfactor; + return t; +} + /* ---------------------------------------------------------------------- return attributes of a rigid body 15 values per body diff --git a/src/fix_rigid.h b/src/fix_rigid.h index 3aa343015a..06121ad47a 100644 --- a/src/fix_rigid.h +++ b/src/fix_rigid.h @@ -32,9 +32,11 @@ class FixRigid : public Fix { virtual void init(); virtual void 
setup(int); virtual void initial_integrate(int); + void post_force(int); virtual void final_integrate(); void initial_integrate_respa(int, int, int); void final_integrate_respa(int, int); + virtual double compute_scalar(); double memory_usage(); void grow_arrays(int); @@ -50,6 +52,7 @@ class FixRigid : public Fix { double compute_array(int, int); protected: + int me,nprocs; double dtv,dtf,dtq; double *step_respa; int triclinic; @@ -70,6 +73,7 @@ class FixRigid : public Fix { int *imagebody; // image flags of xcm of each rigid body double **fflag; // flag for on/off of center-of-mass force double **tflag; // flag for on/off of center-of-mass torque + double **langextra; // Langevin thermostat forces and torques int *body; // which body each atom is part of (-1 if none) double **displace; // displacement of each atom in body coords @@ -85,6 +89,9 @@ class FixRigid : public Fix { double **qorient; // rotation state of ext particle wrt rigid body double **dorient; // orientation of dipole mu wrt rigid body + double tfactor; // scale factor on temperature of rigid bodies + int langflag; // 0/1 = no/yes Langevin thermostat + int tempflag; // NVT settings double t_start,t_stop; double t_period,t_freq; @@ -95,6 +102,7 @@ class FixRigid : public Fix { double p_period,p_freq; int p_chain; + class RanMars *random; class AtomVecEllipsoid *avec_ellipsoid; // bitmasks for eflags From 72644dcb8d26290fcc21da6992876b810be9b737 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 16:27:56 +0000 Subject: [PATCH 05/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6034 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/compute_temp_asphere.html | 52 ++++++++++++++++------ doc/compute_temp_asphere.txt | 47 ++++++++++++++------ doc/compute_temp_sphere.html | 56 ++++++++++++++++------- doc/compute_temp_sphere.txt | 51 ++++++++++++++------- doc/fix_langevin.html | 38 +++++++++++++--- doc/fix_langevin.txt | 37 ++++++++++++--- doc/fix_rigid.html | 84 
+++++++++++++++++++++++++---------- doc/fix_rigid.txt | 84 +++++++++++++++++++++++++---------- 8 files changed, 333 insertions(+), 116 deletions(-) diff --git a/doc/compute_temp_asphere.html b/doc/compute_temp_asphere.html index 3b29b68e74..daaad528a9 100644 --- a/doc/compute_temp_asphere.html +++ b/doc/compute_temp_asphere.html @@ -13,16 +13,29 @@

Syntax:

-
compute ID group-ID temp/asphere bias-ID 
+
compute ID group-ID temp/asphere keyword value ... 
 
-
  • ID, group-ID are documented in compute command -
  • temp/asphere = style name of this compute command -
  • bias-ID = ID of a temperature compute that removes a velocity bias (optional) +
    • ID, group-ID are documented in compute command + +
    • temp/asphere = style name of this compute command + +
    • zero or more keyword/value pairs may be appended + +
    • keyword = bias or dof + +
        bias value = bias-ID
      +    bias-ID = ID of a temperature compute that removes a velocity bias
      +  dof value = all or rotate
      +    all = compute temperature of translational and rotational degrees of freedom
      +    rotate = compute temperature of just rotational degrees of freedom 
      +
      +

    Examples:

    compute 1 all temp/asphere
    -compute myTemp mobile temp/asphere tempCOM 
    +compute myTemp mobile temp/asphere bias tempCOM
    +compute myTemp mobile temp/asphere dof rotate 
     

    Description:

    @@ -75,15 +88,6 @@ vector are ordered xx, yy, zz, xy, xz, yz. constant for the duration of the run; use the dynamic option of the compute_modify command if this is not the case.

    -

    If a bias-ID is specified it must be the ID of a temperature compute -that removes a "bias" velocity from each atom. This allows compute -temp/sphere to compute its thermal temperature after the translational -kinetic energy components have been altered in a prescribed way, -e.g. to remove a velocity profile. Thermostats that use this compute -will work with this bias term. See the doc pages for individual -computes that calculate a temperature and the doc pages for fixes that -perform thermostatting for more details. -

    This compute subtracts out translational degrees-of-freedom due to fixes that constrain molecular motion, such as fix shake and fix rigid. This means the @@ -96,6 +100,26 @@ be altered using the extra option of the discussion of different ways to compute temperature and perform thermostatting.

    +
    + +

    The keyword/value option pairs are used in the following ways. +

    +

      For the bias keyword, bias-ID refers to the ID of a temperature +compute that removes a "bias" velocity from each atom. This allows +compute temp/asphere to compute its thermal temperature after the +translational kinetic energy components have been altered in a +prescribed way, e.g. to remove a velocity profile. Thermostats that +use this compute will work with this bias term. See the doc pages for +individual computes that calculate a temperature and the doc pages for +fixes that perform thermostatting for more details. +

    +

    For the dof keyword, a setting of all calculates a temperature +that includes both translational and rotational degrees of freedom. A +setting of rotate calculates a temperature that includes only +rotational degrees of freedom. +

    +
    +

    Output info:

    This compute calculates a global scalar (the temperature) and a global diff --git a/doc/compute_temp_asphere.txt b/doc/compute_temp_asphere.txt index b22256fbe8..cdd8870981 100755 --- a/doc/compute_temp_asphere.txt +++ b/doc/compute_temp_asphere.txt @@ -10,16 +10,24 @@ compute temp/asphere command :h3 [Syntax:] -compute ID group-ID temp/asphere bias-ID :pre +compute ID group-ID temp/asphere keyword value ... :pre -ID, group-ID are documented in "compute"_compute.html command -temp/asphere = style name of this compute command -bias-ID = ID of a temperature compute that removes a velocity bias (optional) :ul +ID, group-ID are documented in "compute"_compute.html command :ulb,l +temp/asphere = style name of this compute command :l +zero or more keyword/value pairs may be appended :l +keyword = {bias} or {dof} :l + {bias} value = bias-ID{uniform} or {gaussian} + bias-ID = ID of a temperature compute that removes a velocity bias + {dof} value = {all} or {rotate} + all = compute temperature of translational and rotational degrees of freedom + rotate = compute temperature of just rotational degrees of freedom :pre +:ule [Examples:] compute 1 all temp/asphere -compute myTemp mobile temp/asphere tempCOM :pre +compute myTemp mobile temp/asphere bias tempCOM +compute myTemp mobile temp/asphere dof rotate :pre [Description:] @@ -72,15 +80,6 @@ The number of atoms contributing to the temperature is assumed to be constant for the duration of the run; use the {dynamic} option of the "compute_modify"_compute_modify.html command if this is not the case. -If a {bias-ID} is specified it must be the ID of a temperature compute -that removes a "bias" velocity from each atom. This allows compute -temp/sphere to compute its thermal temperature after the translational -kinetic energy components have been altered in a prescribed way, -e.g. to remove a velocity profile. Thermostats that use this compute -will work with this bias term. 
See the doc pages for individual -computes that calculate a temperature and the doc pages for fixes that -perform thermostatting for more details. - This compute subtracts out translational degrees-of-freedom due to fixes that constrain molecular motion, such as "fix shake"_fix_shake.html and "fix rigid"_fix_rigid.html. This means the @@ -93,6 +92,26 @@ See "this howto section"_Section_howto.html#4_16 of the manual for a discussion of different ways to compute temperature and perform thermostatting. +:line + +The keyword/value option pairs are used in the following ways. + +For the {bias} keyword, {bias-ID} refers to the ID of a temperature +compute that removes a "bias" velocity from each atom. This allows +compute temp/sphere to compute its thermal temperature after the +translational kinetic energy components have been altered in a +prescribed way, e.g. to remove a velocity profile. Thermostats that +use this compute will work with this bias term. See the doc pages for +individual computes that calculate a temperature and the doc pages for +fixes that perform thermostatting for more details. + +For the {dof} keyword, a setting of {all} calculates a temperature +that includes both translational and rotational degrees of freedom. A +setting of {rotate} calculates a temperature that includes only +rotational degrees of freedom. + +:line + [Output info:] This compute calculates a global scalar (the temperature) and a global diff --git a/doc/compute_temp_sphere.html b/doc/compute_temp_sphere.html index 31e73a05f5..23e18d16b5 100644 --- a/doc/compute_temp_sphere.html +++ b/doc/compute_temp_sphere.html @@ -13,16 +13,29 @@

    Syntax:

    -
    compute ID group-ID temp/sphere bias-ID 
    +
    compute ID group-ID temp/sphere keyword value ... 
     
    -
    • ID, group-ID are documented in compute command -
    • temp/sphere = style name of this compute command -
    • bias-ID = ID of a temperature compute that removes a velocity bias (optional) +
      • ID, group-ID are documented in compute command + +
      • temp/sphere = style name of this compute command + +
      • zero or more keyword/value pairs may be appended + +
      • keyword = bias or dof + +
          bias value = bias-ID
        +    bias-ID = ID of a temperature compute that removes a velocity bias
        +  dof value = all or rotate
        +    all = compute temperature of translational and rotational degrees of freedom
        +    rotate = compute temperature of just rotational degrees of freedom 
        +
        +

      Examples:

      compute 1 all temp/sphere
      -compute myTemp mobile temp/sphere tempCOM 
      +compute myTemp mobile temp/sphere bias tempCOM
      +compute myTemp mobile temp/sphere dof rotate 
       

      Description:

      @@ -66,15 +79,6 @@ the vector are ordered xx, yy, zz, xy, xz, yz. constant for the duration of the run; use the dynamic option of the compute_modify command if this is not the case.

      -

      If a bias-ID is specified it must be the ID of a temperature compute -that removes a "bias" velocity from each atom. This allows compute -temp/sphere to compute its thermal temperature after the translational -kinetic energy components have been altered in a prescribed way, -e.g. to remove a velocity profile. Thermostats that use this compute -will work with this bias term. See the doc pages for individual -computes that calculate a temperature and the doc pages for fixes that -perform thermostatting for more details. -

      This compute subtracts out translational degrees-of-freedom due to fixes that constrain molecular motion, such as fix shake and fix rigid. This means the @@ -87,6 +91,26 @@ be altered using the extra option of the discussion of different ways to compute temperature and perform thermostatting.

      +
      + +

      The keyword/value option pairs are used in the following ways. +

      +

      For the bias keyword, bias-ID refers to the ID of a temperature +compute that removes a "bias" velocity from each atom. This allows +compute temp/sphere to compute its thermal temperature after the +translational kinetic energy components have been altered in a +prescribed way, e.g. to remove a velocity profile. Thermostats that +use this compute will work with this bias term. See the doc pages for +individual computes that calculate a temperature and the doc pages for +fixes that perform thermostatting for more details. +

      +

      For the dof keyword, a setting of all calculates a temperature +that includes both translational and rotational degrees of freedom. A +setting of rotate calculates a temperature that includes only +rotational degrees of freedom. +

      +
      +

      Output info:

      This compute calculates a global scalar (the temperature) and a global @@ -116,6 +140,8 @@ particles with radius = 0.0.

      compute temp, compute temp/asphere

      -

      Default: none +

      Default: +

      +

      The option defaults are no bias and dof = all.

      diff --git a/doc/compute_temp_sphere.txt b/doc/compute_temp_sphere.txt index 874f50f364..16d1fcc761 100755 --- a/doc/compute_temp_sphere.txt +++ b/doc/compute_temp_sphere.txt @@ -10,16 +10,24 @@ compute temp/sphere command :h3 [Syntax:] -compute ID group-ID temp/sphere bias-ID :pre +compute ID group-ID temp/sphere keyword value ... :pre -ID, group-ID are documented in "compute"_compute.html command -temp/sphere = style name of this compute command -bias-ID = ID of a temperature compute that removes a velocity bias (optional) :ul +ID, group-ID are documented in "compute"_compute.html command :ulb,l +temp/sphere = style name of this compute command :l +zero or more keyword/value pairs may be appended :l +keyword = {bias} or {dof} :l + {bias} value = bias-ID{uniform} or {gaussian} + bias-ID = ID of a temperature compute that removes a velocity bias + {dof} value = {all} or {rotate} + all = compute temperature of translational and rotational degrees of freedom + rotate = compute temperature of just rotational degrees of freedom :pre +:ule [Examples:] compute 1 all temp/sphere -compute myTemp mobile temp/sphere tempCOM :pre +compute myTemp mobile temp/sphere bias tempCOM +compute myTemp mobile temp/sphere dof rotate :pre [Description:] @@ -63,15 +71,6 @@ The number of atoms contributing to the temperature is assumed to be constant for the duration of the run; use the {dynamic} option of the "compute_modify"_compute_modify.html command if this is not the case. -If a {bias-ID} is specified it must be the ID of a temperature compute -that removes a "bias" velocity from each atom. This allows compute -temp/sphere to compute its thermal temperature after the translational -kinetic energy components have been altered in a prescribed way, -e.g. to remove a velocity profile. Thermostats that use this compute -will work with this bias term. 
See the doc pages for individual -computes that calculate a temperature and the doc pages for fixes that -perform thermostatting for more details. - This compute subtracts out translational degrees-of-freedom due to fixes that constrain molecular motion, such as "fix shake"_fix_shake.html and "fix rigid"_fix_rigid.html. This means the @@ -84,6 +83,26 @@ See "this howto section"_Section_howto.html#4_16 of the manual for a discussion of different ways to compute temperature and perform thermostatting. +:line + +The keyword/value option pairs are used in the following ways. + +For the {bias} keyword, {bias-ID} refers to the ID of a temperature +compute that removes a "bias" velocity from each atom. This allows +compute temp/sphere to compute its thermal temperature after the +translational kinetic energy components have been altered in a +prescribed way, e.g. to remove a velocity profile. Thermostats that +use this compute will work with this bias term. See the doc pages for +individual computes that calculate a temperature and the doc pages for +fixes that perform thermostatting for more details. + +For the {dof} keyword, a setting of {all} calculates a temperature +that includes both translational and rotational degrees of freedom. A +setting of {rotate} calculates a temperature that includes only +rotational degrees of freedom. + +:line + [Output info:] This compute calculates a global scalar (the temperature) and a global @@ -113,4 +132,6 @@ particles with radius = 0.0. "compute temp"_compute_temp.html, "compute temp/asphere"_compute_temp.html -[Default:] none +[Default:] + +The option defaults are no bias and dof = all. diff --git a/doc/fix_langevin.html b/doc/fix_langevin.html index 07c16421e0..b304ac44fb 100644 --- a/doc/fix_langevin.html +++ b/doc/fix_langevin.html @@ -27,14 +27,21 @@
    • zero or more keyword/value pairs may be appended -
      keyword = scale or tally
      +
    • keyword = angmom or omega or scale or tally or zero + +
        angmom value = no or yes
      +    no = do not thermostat rotational degrees of freedom via the angular momentum
      +    yes = do thermostat rotational degrees of freedom via the angular momentum
      +  omega value = no or yes
+    no = do not thermostat rotational degrees of freedom via the angular velocity
      +    yes = do thermostat rotational degrees of freedom via the angular velocity
         scale values = type ratio
           type = atom type (1-N)
           ratio = factor by which to scale the damping coefficient
      -  tally values = no or yes
      +  tally value = no or yes
           no = do not tally the energy added/subtracted to atoms
           yes = do tally the energy added/subtracted to atoms
      -  zero values = no or yes
      +  zero value = no or yes
           no = do not set total random force to zero
           yes = set total random force to zero 
       
      @@ -135,6 +142,25 @@ generate its own unique seed and its own stream of random numbers. Thus the dynamics of the system will not be identical on two runs on different numbers of processors.

      +
      + +

      The keyword/value option pairs are used in the following ways. +

      +

      The angmom and omega keywords enable thermostatting of +rotational degrees of freedom in addition to the usual translational +degrees of freedom. This can only be done for finite-size particles. +A simulation using atom_style sphere defines an omega for finite-size +spheres. A simulation using atom_style ellipsoid defines a finite +size and shape for aspherical particles and an angular momentum. The +Langevin formulas for thermostatting the rotational degrees of freedom +are the same as those above, where force is replaced by torque, m is +replaced by the moment of inertia I, and v is replaced by omega (which +is derived from the angular momentum in the case of aspherical +particles). The rotational temperature of the particles can be +monitored by the compute temp/sphere and +compute temp/asphere commands with their +rotate options. +

      The keyword scale allows the damp factor to be scaled up or down by the specified factor for atoms of that type. This can be useful when different atom types have different sizes or masses. It can be used @@ -166,6 +192,8 @@ to zero by subtracting off an equal part of it from each atom in the group. As a result, the center-of-mass of a system with zero initial momentum will not drift over time.

      +
      +

      Restart, fix_modify, output, run start/stop, minimize info:

      No information about this fix is written to binary restart @@ -209,8 +237,8 @@ dpd/tstat

      Default:

      -

      The option defaults are scale = 1.0 for all types, tally = no, zero = -no. +

      The option defaults are angmom = no, omega = no, scale = 1.0 for all +types, tally = no, zero = no.


      diff --git a/doc/fix_langevin.txt b/doc/fix_langevin.txt index 422889c9b1..1228b63265 100644 --- a/doc/fix_langevin.txt +++ b/doc/fix_langevin.txt @@ -18,14 +18,20 @@ Tstart,Tstop = desired temperature at start/end of run (temperature units) :l damp = damping parameter (time units) :l seed = random number seed to use for white noise (positive integer) :l zero or more keyword/value pairs may be appended :l -keyword = {scale} or {tally} +keyword = {angmom} or {omega} or {scale} or {tally} or {zero} :l + {angmom} value = {no} or {yes} + {no} = do not thermostat rotational degrees of freedom via the angular momentum + {yes} = do thermostat rotational degrees of freedom via the angular momentum + {omega} value = {no} or {yes} + {no} = do not thermostat rotational degrees of freedom via then angular velocity + {yes} = do thermostat rotational degrees of freedom via the angular velocity {scale} values = type ratio type = atom type (1-N) ratio = factor by which to scale the damping coefficient - {tally} values = {no} or {yes} + {tally} value = {no} or {yes} {no} = do not tally the energy added/subtracted to atoms {yes} = do tally the energy added/subtracted to atoms - {zero} values = {no} or {yes} + {zero} value = {no} or {yes} {no} = do not set total random force to zero {yes} = set total random force to zero :pre :ule @@ -125,6 +131,25 @@ generate its own unique seed and its own stream of random numbers. Thus the dynamics of the system will not be identical on two runs on different numbers of processors. +:line + +The keyword/value option pairs are used in the following ways. + +The keyword {angmom} and {omega} keywords enable thermostatting of +rotational degrees of freedom in addition to the usual translational +degrees of freedom. This can only be done for finite-size particles. +A simulation using atom_style sphere defines an omega for finite-size +spheres. 
A simulation using atom_style ellipsoid defines a finite +size and shape for aspherical particles and an angular momentum. The +Langevin formulas for thermostatting the rotational degrees of freedom +are the same as those above, where force is replaced by torque, m is +replaced by the moment of inertia I, and v is replaced by omega (which +is derived from the angular momentum in the case of aspherical +particles). The rotational temperature of the particles can be +monitored by the "compute temp/sphere"_compute_temp_sphere.html and +"compute temp/asphere"_compute_temp_asphere.html commands with their +rotate options. + The keyword {scale} allows the damp factor to be scaled up or down by the specified factor for atoms of that type. This can be useful when different atom types have different sizes or masses. It can be used @@ -156,6 +181,8 @@ to zero by subtracting off an equal part of it from each atom in the group. As a result, the center-of-mass of a system with zero initial momentum will not drift over time. +:line + [Restart, fix_modify, output, run start/stop, minimize info:] No information about this fix is written to "binary restart @@ -199,8 +226,8 @@ dpd/tstat"_pair_dpd.html [Default:] -The option defaults are scale = 1.0 for all types, tally = no, zero = -no. +The option defaults are angmom = no, omega = no, scale = 1.0 for all +types, tally = no, zero = no. :line diff --git a/doc/fix_rigid.html b/doc/fix_rigid.html index b7b057c857..9abf8effc8 100644 --- a/doc/fix_rigid.html +++ b/doc/fix_rigid.html @@ -33,9 +33,13 @@
    • zero or more keyword/value pairs may be appended -
    • keyword = temp or press or tparam or pparam or force or torque +
    • keyword = langevin or temp or tparam or force or torque -
        temp values = Tstart Tstop Tperiod
      +
        langevin values = Tstart Tstop Tdamp seed
      +    Tstart,Tstop = desired temperature at start/stop of run (temperature units)
      +    Tdamp = temperature damping parameter (time units)
      +    seed = random number seed to use for white noise (positive integer)
      +  temp values = Tstart Tstop Tdamp
           Tstart,Tstop = desired temperature at start/stop of run (temperature units)
           Tdamp = temperature damping parameter (time units)
         tparam values = Tchain Titer Torder
      @@ -54,7 +58,7 @@
       

      Examples:

      fix 1 clump rigid single
      -fix 1 clump rigid single force 1 off off on
      +fix 1 clump rigid single force 1 off off on langevin 1.0 1.0 1.0 428984
       fix 1 polychains rigid/nvt molecule temp 1.0 1.0 5.0
       fix 1 polychains rigid molecule force 1*5 off off off force 6*10 off off on
       fix 2 fluid rigid group 3 clump1 clump2 clump3 torque * off off off 
      @@ -200,19 +204,35 @@ multiple rigid fixes to be defined, but it is more expensive.
       


      -

      As stated above, the rigid and rigid/nve styles perform constant -NVE time integration. Thus the temp, press, and tparam keywords -cannot be used with these styles. +

      The keyword/value option pairs are used in the following ways.

      -

      The rigid/nvt style performs constant NVT time integration, using a -temperature it computes for the rigid bodies which includes their -translational and rotational motion. The temp keyword must be used -with this style. The desired temperature at each timestep is a ramped -value during the run from Tstart to Tstop. The Tdamp parameter -is specified in time units and determines how rapidly the temperature -is relaxed. For example, a value of 100.0 means to relax the -temperature in a timespan of (roughly) 100 time units (tau or fmsec or -psec - see the units command). +

      The langevin and temp and tparam keywords perform thermostatting +of the rigid bodies, altering both their translational and rotational +degrees of freedom. What is meant by "temperature" of a collection of +rigid bodies and how it can be monitored via the fix output is +discussed below. +

      +

      The langevin keyword applies a Langevin thermostat to the constant +NVE time integration performed by either the rigid or rigid/nve +styles. It cannot be used with the rigid/nvt style. The desired +temperature at each timestep is a ramped value during the run from +Tstart to Tstop. The Tdamp parameter is specified in time units +and determines how rapidly the temperature is relaxed. For example, a +value of 100.0 means to relax the temperature in a timespan of +(roughly) 100 time units (tau or fmsec or psec - see the +units command). The random # seed must be a positive +integer. The way the Langevin thermostatting operates is explained on +the fix langevin doc page. +

      +

      The temp and tparam keywords apply a Nose/Hoover thermostat to the +NVT time integration performed by the rigid/nvt style. They cannot +be used with the rigid or rigid/nve styles. The desired +temperature at each timestep is a ramped value during the run from +Tstart to Tstop. The Tdamp parameter is specified in time units +and determines how rapidly the temperature is relaxed. For example, a +value of 100.0 means to relax the temperature in a timespan of +(roughly) 100 time units (tau or fmsec or psec - see the +units command).

      Nose/Hoover chains are used in conjunction with this thermostat. The tparam keyword can optionally be used to change the chain settings @@ -222,18 +242,22 @@ oscillations in temperature that can occur in a simulation. As a rule of thumb, increasing the chain length should lead to smaller oscillations.

      -

      There are alternate ways to thermostat a system of rigid bodies. You -can use fix langevin to treat the system as -effectively immersed in an implicit solvent, e.g. a Brownian dynamics -model. For hybrid systems with both rigid bodies and solvent -particles, you can thermostat only the solvent particles that surround -one or more rigid bodies by appropriate choice of groups in the -compute and fix commands for temperature and thermostatting. The -solvent interactions with the rigid bodies should then effectively -thermostat the rigid body temperature as well. +

      IMPORTANT NOTE: There are alternate ways to thermostat a system of +rigid bodies. You can use fix langevin to treat +the individual particles in the rigid bodies as effectively immersed +in an implicit solvent, e.g. a Brownian dynamics model. For hybrid +systems with both rigid bodies and solvent particles, you can +thermostat only the solvent particles that surround one or more rigid +bodies by appropriate choice of groups in the compute and fix commands +for temperature and thermostatting. The solvent interactions with the +rigid bodies should then effectively thermostat the rigid body +temperature as well without use of the Langevin or Nose/Hoover options +associated with the fix rigid commands.


      +

      The keyword/value option pairs are used in the following ways. +

      If you use a temperature compute with a group that includes particles in rigid bodies, the degrees-of-freedom removed by each rigid body are accounted for in the temperature (and pressure) @@ -289,6 +313,18 @@ rigid/nvt fix to add the energy change induced by the thermostatting to the system's potential energy as part of thermodynamic output.

      +

      The rigid and rigid/nve fixes compute a global scalar which can be +accessed by various output commands. The +scalar value calculated by these fixes is "intensive". The scalar is +the current temperature of the collection of rigid bodies. This is +averaged over all rigid bodies and their translational and rotational +degrees of freedom. The translational energy of a rigid body is 1/2 m +v^2, where m = total mass of the body and v = the velocity of its +center of mass. The rotational energy of a rigid body is 1/2 I w^2, +where I = the moment of inertia tensor of the body and w = its angular +velocity. Degrees of freedom constrained by the force and torque +keywords are removed from this calculation. +

      The rigid/nvt fix computes a global scalar which can be accessed by various output commands. The scalar value calculated by the rigid/nvt fix is "extensive". The scalar is the diff --git a/doc/fix_rigid.txt b/doc/fix_rigid.txt index 1a1c7ee6d4..9130c881ed 100644 --- a/doc/fix_rigid.txt +++ b/doc/fix_rigid.txt @@ -24,8 +24,12 @@ bodystyle = {single} or {molecule} or {group} :l groupID1, groupID2, ... = list of N group IDs :pre zero or more keyword/value pairs may be appended :l -keyword = {temp} or {press} or {tparam} or {pparam} or {force} or {torque} :l - {temp} values = Tstart Tstop Tperiod +keyword = {langevin} or {temp} or {tparam} or {force} or {torque} :l + {langevin} values = Tstart Tstop Tperiod seed + Tstart,Tstop = desired temperature at start/stop of run (temperature units) + Tdamp = temperature damping parameter (time units) + seed = random number seed to use for white noise (positive integer) + {temp} values = Tstart Tstop Tdamp Tstart,Tstop = desired temperature at start/stop of run (temperature units) Tdamp = temperature damping parameter (time units) {tparam} values = Tchain Titer Torder @@ -43,7 +47,7 @@ keyword = {temp} or {press} or {tparam} or {pparam} or {force} or {torque} :l [Examples:] fix 1 clump rigid single -fix 1 clump rigid single force 1 off off on +fix 1 clump rigid single force 1 off off on langevin 1.0 1.0 1.0 428984 fix 1 polychains rigid/nvt molecule temp 1.0 1.0 5.0 fix 1 polychains rigid molecule force 1*5 off off off force 6*10 off off on fix 2 fluid rigid group 3 clump1 clump2 clump3 torque * off off off :pre @@ -189,19 +193,35 @@ multiple rigid fixes to be defined, but it is more expensive. :line -As stated above, the {rigid} and {rigid/nve} styles perform constant -NVE time integration. Thus the {temp}, {press}, and {tparam} keywords -cannot be used with these styles. +The keyword/value option pairs are used in the following ways. 
-The {rigid/nvt} style performs constant NVT time integration, using a -temperature it computes for the rigid bodies which includes their -translational and rotational motion. The {temp} keyword must be used -with this style. The desired temperature at each timestep is a ramped -value during the run from {Tstart} to {Tstop}. The {Tdamp} parameter -is specified in time units and determines how rapidly the temperature -is relaxed. For example, a value of 100.0 means to relax the -temperature in a timespan of (roughly) 100 time units (tau or fmsec or -psec - see the "units"_units.html command). +The {langevin} and {temp} and {tparam} keywords perform thermostatting +of the rigid bodies, altering both their translational and rotational +degrees of freedom. What is meant by "temperature" of a collection of +rigid bodies and how it can be monitored via the fix output is +discussed below. + +The {langevin} keyword applies a Langevin thermostat to the constant +NVE time integration performed by either the {rigid} or {rigid/nve} +styles. It cannot be used with the {rigid/nvt} style. The desired +temperature at each timestep is a ramped value during the run from +{Tstart} to {Tstop}. The {Tdamp} parameter is specified in time units +and determines how rapidly the temperature is relaxed. For example, a +value of 100.0 means to relax the temperature in a timespan of +(roughly) 100 time units (tau or fmsec or psec - see the +"units"_units.html command). The random # {seed} must be a positive +integer. The way the Langevin thermostatting operates is explained on +the "fix langevin"_fix_langevin.html doc page. + +The {temp} and {tparam} keywords apply a Nose/Hoover thermostat to the +NVT time integration performed by the {rigid/nvt} style. They cannot +be used with the {rigid} or {rigid/nve} styles. The desired +temperature at each timestep is a ramped value during the run from +{Tstart} to {Tstop}. 
The {Tdamp} parameter is specified in time units +and determines how rapidly the temperature is relaxed. For example, a +value of 100.0 means to relax the temperature in a timespan of +(roughly) 100 time units (tau or fmsec or psec - see the +"units"_units.html command). Nose/Hoover chains are used in conjunction with this thermostat. The {tparam} keyword can optionally be used to change the chain settings @@ -211,18 +231,22 @@ oscillations in temperature that can occur in a simulation. As a rule of thumb, increasing the chain length should lead to smaller oscillations. -There are alternate ways to thermostat a system of rigid bodies. You -can use "fix langevin"_fix_langevin.html to treat the system as -effectively immersed in an implicit solvent, e.g. a Brownian dynamics -model. For hybrid systems with both rigid bodies and solvent -particles, you can thermostat only the solvent particles that surround -one or more rigid bodies by appropriate choice of groups in the -compute and fix commands for temperature and thermostatting. The -solvent interactions with the rigid bodies should then effectively -thermostat the rigid body temperature as well. +IMPORTANT NOTE: There are alternate ways to thermostat a system of +rigid bodies. You can use "fix langevin"_fix_langevin.html to treat +the individual particles in the rigid bodies as effectively immersed +in an implicit solvent, e.g. a Brownian dynamics model. For hybrid +systems with both rigid bodies and solvent particles, you can +thermostat only the solvent particles that surround one or more rigid +bodies by appropriate choice of groups in the compute and fix commands +for temperature and thermostatting. The solvent interactions with the +rigid bodies should then effectively thermostat the rigid body +temperature as well without use of the Langevin or Nose/Hoover options +associated with the fix rigid commands. :line +The keyword/value option pairs are used in the following ways. 
+ If you use a "temperature compute"_compute.html with a group that includes particles in rigid bodies, the degrees-of-freedom removed by each rigid body are accounted for in the temperature (and pressure) @@ -278,6 +302,18 @@ rigid/nvt fix to add the energy change induced by the thermostatting to the system's potential energy as part of "thermodynamic output"_thermo_style.html. +The rigid and rigid/nve fixes computes a global scalar which can be +accessed by various "output commands"_Section_howto.html#4_15. The +scalar value calculated by these fixes is "intensive". The scalar is +the current temperature of the collection of rigid bodies. This is +averaged over all rigid bodies and their translational and rotational +degrees of freedom. The translational energy of a rigid body is 1/2 m +v^2, where m = total mass of the body and v = the velocity of its +center of mass. The rotational energy of a rigid body is 1/2 I w^2, +where I = the moment of inertia tensor of the body and w = its angular +velocity. Degrees of freedom constrained by the {force} and {torque} +keywords are removed from this calculation. + The rigid/nvt fix computes a global scalar which can be accessed by various "output commands"_Section_howto.html#4_15. The scalar value calculated by the rigid/nvt fix is "extensive". 
The scalar is the From a990ae69be42fc5acb30c93cdcb3c391f19ae5fc Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 16:28:42 +0000 Subject: [PATCH 06/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6035 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/fix_langevin.cpp | 26 +++++++++++++------------- src/fix_rigid_nve.cpp | 18 +++++++++++------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp index 0d233126ef..2f4308e386 100644 --- a/src/fix_langevin.cpp +++ b/src/fix_langevin.cpp @@ -82,7 +82,19 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : int iarg = 7; while (iarg < narg) { - if (strcmp(arg[iarg],"scale") == 0) { + if (strcmp(arg[iarg],"angmom") == 0) { + if (iarg+2 > narg) error->all("Illegal fix langevin command"); + if (strcmp(arg[iarg+1],"no") == 0) aflag = 0; + else if (strcmp(arg[iarg+1],"yes") == 0) aflag = 1; + else error->all("Illegal fix langevin command"); + iarg += 2; + } else if (strcmp(arg[iarg],"omega") == 0) { + if (iarg+2 > narg) error->all("Illegal fix langevin command"); + if (strcmp(arg[iarg+1],"no") == 0) oflag = 0; + else if (strcmp(arg[iarg+1],"yes") == 0) oflag = 1; + else error->all("Illegal fix langevin command"); + iarg += 2; + } else if (strcmp(arg[iarg],"scale") == 0) { if (iarg+3 > narg) error->all("Illegal fix langevin command"); int itype = atoi(arg[iarg+1]); double scale = atof(arg[iarg+2]); @@ -102,18 +114,6 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+1],"yes") == 0) zeroflag = 1; else error->all("Illegal fix langevin command"); iarg += 2; - } else if (strcmp(arg[iarg],"omega") == 0) { - if (iarg+2 > narg) error->all("Illegal fix langevin command"); - if (strcmp(arg[iarg+1],"no") == 0) oflag = 0; - else if (strcmp(arg[iarg+1],"yes") == 0) oflag = 1; - else error->all("Illegal fix langevin command"); - iarg += 2; - } else if (strcmp(arg[iarg],"angmom") == 0) { - if (iarg+2 > narg) 
error->all("Illegal fix langevin command"); - if (strcmp(arg[iarg+1],"no") == 0) aflag = 0; - else if (strcmp(arg[iarg+1],"yes") == 0) aflag = 1; - else error->all("Illegal fix langevin command"); - iarg += 2; } else error->all("Illegal fix langevin command"); } diff --git a/src/fix_rigid_nve.cpp b/src/fix_rigid_nve.cpp index ccd908e8f8..abdb258a75 100644 --- a/src/fix_rigid_nve.cpp +++ b/src/fix_rigid_nve.cpp @@ -223,16 +223,20 @@ void FixRigidNVE::final_integrate() MPI_Allreduce(sum[0],all[0],6*nbody,MPI_DOUBLE,MPI_SUM,world); + // update vcm and angmom + // include Langevin thermostat forces + // fflag,tflag = 0 for some dimensions in 2d + double mbody[3],tbody[3],fquat[4]; double dtf2 = dtf * 2.0; - + for (ibody = 0; ibody < nbody; ibody++) { - fcm[ibody][0] = all[ibody][0]; - fcm[ibody][1] = all[ibody][1]; - fcm[ibody][2] = all[ibody][2]; - torque[ibody][0] = all[ibody][3]; - torque[ibody][1] = all[ibody][4]; - torque[ibody][2] = all[ibody][5]; + fcm[ibody][0] = all[ibody][0] + langextra[ibody][0]; + fcm[ibody][1] = all[ibody][1] + langextra[ibody][1]; + fcm[ibody][2] = all[ibody][2] + langextra[ibody][2]; + torque[ibody][0] = all[ibody][3] + langextra[ibody][3]; + torque[ibody][1] = all[ibody][4] + langextra[ibody][4]; + torque[ibody][2] = all[ibody][5] + langextra[ibody][5]; // update vcm by 1/2 step From f702d5c26a92cffeed89a868560d8d19b43b75a2 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 16:32:19 +0000 Subject: [PATCH 07/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6036 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 66a97ccc65..b7cd4f016f 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "27 Apr 2011" +#define LAMMPS_VERSION "29 Apr 2011" From 3b3c1d118dfe1aaf36d7307158cd2a6390c27b8d Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 19:48:13 +0000 Subject: [PATCH 08/21] 
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6040 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/fix_rigid.cpp | 10 ++-------- src/math_extra.cpp | 32 ++++++++++++++++++++++++++++---- src/math_extra.h | 14 ++++++++++++++ 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/src/fix_rigid.cpp b/src/fix_rigid.cpp index 6d43b9949e..b24b87b41c 100644 --- a/src/fix_rigid.cpp +++ b/src/fix_rigid.cpp @@ -679,10 +679,7 @@ void FixRigid::init() for (i = 0; i < nlocal; i++) { if (body[i] < 0) continue; ibody = body[i]; - - itype = type[i]; - if (rmass) massone = rmass[i]; - else massone = mass[itype]; + massone = rmass[i]; if (eflags[i] & INERTIA_SPHERE) { sum[ibody][0] += 0.4 * massone * radius[i]*radius[i]; @@ -842,10 +839,7 @@ void FixRigid::init() for (i = 0; i < nlocal; i++) { if (body[i] < 0) continue; ibody = body[i]; - - itype = type[i]; - if (rmass) massone = rmass[i]; - else massone = mass[itype]; + massone = rmass[i]; if (eflags[i] & INERTIA_SPHERE) { sum[ibody][0] += 0.4 * massone * radius[i]*radius[i]; diff --git a/src/math_extra.cpp b/src/math_extra.cpp index 5160262aff..32acc8ed08 100644 --- a/src/math_extra.cpp +++ b/src/math_extra.cpp @@ -177,7 +177,7 @@ void rotate(double matrix[3][3], int i, int j, int k, int l, /* ---------------------------------------------------------------------- Richardson iteration to update quaternion from angular momentum return new normalized quaternion q - also returns + also returns updated omega at 1/2 step ------------------------------------------------------------------------- */ void richardson(double *q, double *m, double *w, double *moments, double dtq) @@ -506,9 +506,9 @@ void inertia_triangle(double *v0, double *v1, double *v2, double v[3][3],sv[3][3],vtsv[3][3]; double vvv[3],v1mv0[3],v2mv0[3],normal[3]; - v[0][0] = v0[0]; v[0][1] = v0[2]; v[0][2] = v0[3]; - v[1][0] = v1[0]; v[1][1] = v1[2]; v[1][2] = v1[3]; - v[2][0] = v2[0]; v[2][1] = v2[2]; v[2][2] = v2[3]; + v[0][0] = v0[0]; v[0][1] = v0[1]; 
v[0][2] = v0[2]; + v[1][0] = v1[0]; v[1][1] = v1[1]; v[1][2] = v1[2]; + v[2][0] = v2[0]; v[2][1] = v2[1]; v[2][2] = v2[2]; times3(s,v,sv); transpose_times3(v,sv,vtsv); @@ -533,6 +533,30 @@ void inertia_triangle(double *v0, double *v1, double *v2, inertia[5] = -inv24*a*vtsv[0][1]; } +/* ---------------------------------------------------------------------- + compute space-frame inertia tensor of a triangle + idiag = previously computed diagonal inertia tensor + quat = orientiation quaternion of triangle + return symmetric inertia tensor as 6-vector in Voigt notation +------------------------------------------------------------------------- */ + +void inertia_triangle(double *idiag, double *quat, double mass, + double *inertia) +{ + double p[3][3],ptrans[3][3],itemp[3][3],tensor[3][3]; + + quat_to_mat(quat,p); + quat_to_mat_trans(quat,ptrans); + diag_times3(idiag,ptrans,itemp); + times3(p,itemp,tensor); + inertia[0] = tensor[0][0]; + inertia[1] = tensor[1][1]; + inertia[2] = tensor[2][2]; + inertia[3] = tensor[1][2]; + inertia[4] = tensor[0][2]; + inertia[5] = tensor[0][1]; +} + /* ---------------------------------------------------------------------- */ } diff --git a/src/math_extra.h b/src/math_extra.h index 44af2e9a8a..1e05c5d728 100755 --- a/src/math_extra.h +++ b/src/math_extra.h @@ -31,6 +31,7 @@ namespace MathExtra { inline void normalize3(const double *v, double *ans); inline void snormalize3(const double, const double *v, double *ans); inline void negate3(double *v); + inline void scale3(double s, double *v); inline void add3(const double *v1, const double *v2, double *ans); inline void sub3(const double *v1, const double *v2, double *ans); inline double len3(const double *v); @@ -119,6 +120,8 @@ namespace MathExtra { double *inertia); void inertia_triangle(double *v0, double *v1, double *v2, double mass, double *inertia); + void inertia_triangle(double *idiag, double *quat, double mass, + double *inertia); } /* 
---------------------------------------------------------------------- @@ -168,6 +171,17 @@ void MathExtra::negate3(double *v) v[2] = -v[2]; } +/* ---------------------------------------------------------------------- + scale vector v by s +------------------------------------------------------------------------- */ + +void MathExtra::scale3(double s, double *v) +{ + v[0] *= s; + v[1] *= s; + v[2] *= s; +} + /* ---------------------------------------------------------------------- ans = v1 + v2 ------------------------------------------------------------------------- */ From ee769613d766262e5bf437164385269bb9e0468d Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 21:05:35 +0000 Subject: [PATCH 09/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6041 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/ASPHERE/compute_temp_asphere.cpp | 2 +- src/ASPHERE/fix_nve_asphere.cpp | 2 +- src/compute_erotate_sphere.cpp | 2 +- src/compute_temp_sphere.cpp | 2 +- src/fix_langevin.cpp | 4 ++-- src/fix_nh_sphere.cpp | 2 +- src/fix_nve_sphere.cpp | 2 +- src/memory.h | 6 +++--- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ASPHERE/compute_temp_asphere.cpp b/src/ASPHERE/compute_temp_asphere.cpp index f2d34ac72a..b4fb8c79f8 100755 --- a/src/ASPHERE/compute_temp_asphere.cpp +++ b/src/ASPHERE/compute_temp_asphere.cpp @@ -34,7 +34,7 @@ using namespace LAMMPS_NS; enum{ROTATE,ALL}; -#define INERTIA 0.2 // moment of inertia for ellipsoid +#define INERTIA 0.2 // moment of inertia prefactor for ellipsoid /* ---------------------------------------------------------------------- */ diff --git a/src/ASPHERE/fix_nve_asphere.cpp b/src/ASPHERE/fix_nve_asphere.cpp index 9e4155581f..e078d2fb75 100755 --- a/src/ASPHERE/fix_nve_asphere.cpp +++ b/src/ASPHERE/fix_nve_asphere.cpp @@ -29,7 +29,7 @@ using namespace LAMMPS_NS; -#define INERTIA 0.2 // moment of inertia for ellipsoid +#define INERTIA 0.2 // moment of inertia prefactor for ellipsoid /* 
---------------------------------------------------------------------- */ diff --git a/src/compute_erotate_sphere.cpp b/src/compute_erotate_sphere.cpp index b357501a06..1aa5ad8d99 100644 --- a/src/compute_erotate_sphere.cpp +++ b/src/compute_erotate_sphere.cpp @@ -23,7 +23,7 @@ using namespace LAMMPS_NS; -#define INERTIA 0.4 // moment of inertia for sphere +#define INERTIA 0.4 // moment of inertia prefactor for sphere /* ---------------------------------------------------------------------- */ diff --git a/src/compute_temp_sphere.cpp b/src/compute_temp_sphere.cpp index 93c9ec74aa..246d58bae4 100644 --- a/src/compute_temp_sphere.cpp +++ b/src/compute_temp_sphere.cpp @@ -28,7 +28,7 @@ using namespace LAMMPS_NS; enum{ROTATE,ALL}; -#define INERTIA 0.4 // moment of inertia for sphere +#define INERTIA 0.4 // moment of inertia prefactor for sphere /* ---------------------------------------------------------------------- */ diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp index 2f4308e386..5e40a7c28b 100644 --- a/src/fix_langevin.cpp +++ b/src/fix_langevin.cpp @@ -40,8 +40,8 @@ using namespace LAMMPS_NS; enum{NOBIAS,BIAS}; -#define SINERTIA 0.4 // moment of inertia for sphere -#define EINERTIA 0.2 // moment of inertia for ellipsoid +#define SINERTIA 0.4 // moment of inertia prefactor for sphere +#define EINERTIA 0.2 // moment of inertia prefactor for ellipsoid /* ---------------------------------------------------------------------- */ diff --git a/src/fix_nh_sphere.cpp b/src/fix_nh_sphere.cpp index d1be4184fd..9f39466552 100644 --- a/src/fix_nh_sphere.cpp +++ b/src/fix_nh_sphere.cpp @@ -24,7 +24,7 @@ using namespace LAMMPS_NS; -#define INERTIA 0.4 // moment of inertia for sphere +#define INERTIA 0.4 // moment of inertia prefactor for sphere /* ---------------------------------------------------------------------- */ diff --git a/src/fix_nve_sphere.cpp b/src/fix_nve_sphere.cpp index 02968e1f13..fc67023f4a 100644 --- a/src/fix_nve_sphere.cpp +++ 
b/src/fix_nve_sphere.cpp @@ -24,7 +24,7 @@ using namespace LAMMPS_NS; -#define INERTIA 0.4 // moment of inertia for sphere +#define INERTIA 0.4 // moment of inertia prefactor for sphere enum{NONE,DIPOLE}; diff --git a/src/memory.h b/src/memory.h index 64901536c2..47abb49443 100644 --- a/src/memory.h +++ b/src/memory.h @@ -45,7 +45,7 @@ class Memory : protected Pointers { bigint nbytes = sizeof(TYPE) * n; array = (TYPE *) smalloc(nbytes,name); return array; - }; + } template TYPE **create(TYPE **&array, int n, const char *name) {fail(name);} @@ -62,7 +62,7 @@ class Memory : protected Pointers { bigint nbytes = sizeof(TYPE) * n; array = (TYPE *) srealloc(array,nbytes,name); return array; - }; + } template TYPE **grow(TYPE **&array, int n, const char *name) {fail(name);} @@ -75,7 +75,7 @@ class Memory : protected Pointers { void destroy(TYPE *array) { sfree(array); - }; + } /* ---------------------------------------------------------------------- create a 1d array with index from nlo to nhi inclusive From 21de80701082fc091af03eefd37883711c690c91 Mon Sep 17 00:00:00 2001 From: athomps Date: Fri, 29 Apr 2011 23:38:34 +0000 Subject: [PATCH 10/21] Added xsu, ysu, zsu to dump custom and dump cfg git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6043 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_start.txt | 29 +++++++++++++++++++++++++++-- doc/dump.html | 26 +++++++++++++++++++++----- doc/dump.txt | 26 +++++++++++++++++++++----- 3 files changed, 69 insertions(+), 12 deletions(-) diff --git a/doc/Section_start.txt b/doc/Section_start.txt index 4b4d96693f..fc45cf8cda 100644 --- a/doc/Section_start.txt +++ b/doc/Section_start.txt @@ -782,7 +782,9 @@ one-letter abbreviation can be used: -in or -i -log or -l -screen or -s --var or -v :ul +-var or -v +-plog or -pl +-pscreen or -ps :ul For example, lmp_ibm might be launched as follows: @@ -846,6 +848,7 @@ logfile is named "file" and each partition also logs information to a file.N. 
For both one-partition and multi-partition mode, if the specified file is "none", then no log files are created. Using a "log"_log.html command in the input script will override this setting. +Option -plog will override the name of the partition log files file.N. -screen file :pre @@ -859,7 +862,8 @@ the partition ID. If the switch is specified in multi-partition mode, the hi-level screen dump is named "file" and each partition also writes screen information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no screen -output is performed. +output is performed. Option -pscreen will override the name of the +partition screen files file.N. -var name value1 value2 ... :pre @@ -878,6 +882,27 @@ defining index and other kinds of variables and "this section"_Section_commands.html#3_2 for more info on using variables in input scripts. +-plog file:pre +Specify the base name for the partition log files, +so partition N writes log information to file.N. If file is +none, then no partition log files are created. +This overrides the +filename specified in the -log command-line option +If this option is not used +the log file for partition N is log.lammps.N or whatever is specified by +the -log command-line option. + +-pscreen file:pre +specify the base name for the +partition screen file, so partition N writes +screen information to file.N. If file is +none, then no partition screen files are created. +This overrides the +filename specified in the -screen command-line option. +If this option is not used +the screen file for partition N is screen.N or whatever is specified by +the -screen command-line option. + :line 2.7 LAMMPS screen output :h4,link(2_7) diff --git a/doc/dump.html b/doc/dump.html index 418f4996b5..f95117f5bc 100644 --- a/doc/dump.html +++ b/doc/dump.html @@ -47,7 +47,8 @@

        custom args = list of atom attributes
           possible attributes = id, mol, type, mass,
      -			  x, y, z, xs, ys, zs, xu, yu, zu, ix, iy, iz,
      +			  x, y, z, xs, ys, zs, xu, yu, zu, 
      +			  xsu, ysu, zsu, ix, iy, iz,
       			  vx, vy, vz, fx, fy, fz,
                                 q, mux, muy, muz, mu,
                                 radius, omegax, omegay, omegaz,
      @@ -62,6 +63,7 @@
             x,y,z = unscaled atom coordinates
             xs,ys,zs = scaled atom coordinates
             xu,yu,zu = unwrapped atom coordinates
      +      xsu,ysu,zsu = scaled unwrapped atom coordinates
             ix,iy,iz = box image that the atom is in
             vx,vy,vz = atom velocities
             fx,fy,fz = forces on atoms
      @@ -228,14 +230,23 @@ extended CFG format files, as used by the
       package.  Since the extended CFG format uses a single snapshot of the
       system per file, a wildcard "*" must be included in the filename, as
       discussed below.  The list of atom attributes for style cfg must
      -begin with "id type xs ys zs", since these quantities are needed to
       +begin with either "id type xs ys zs" or "id type xsu ysu zsu", 
       +since these quantities are needed to
       write the CFG files in the appropriate format (though the "id" and
       "type" fields do not appear explicitly in the file).  Any remaining
       attributes will be stored as "auxiliary properties" in the CFG files.
       Note that you will typically want to use the dump_modify
       element command with CFG-formatted files, to
       associate element names with atom types, so that AtomEye can render
      -atoms appropriately.
      +atoms appropriately. When unwrapped coordinates xsu, ysu, and zsu
      +are requested, the nominal AtomEye periodic cell dimensions are expanded 
       +by a large factor UNWRAPEXPAND = 10.0, which ensures that atoms are 
       +displayed correctly for up to UNWRAPEXPAND/2 periodic boundary crossings 
      +in any direction. 
      +Beyond this, AtomEye will rewrap the unwrapped coordinates. 
      +The expansion causes the atoms to be drawn farther
      +away from the viewer, but it is easy to zoom the atoms closer, and
      +the interatomic distances are unaffected.   
       

      The dcd style writes DCD files, a standard atomic trajectory format used by the CHARMM, NAMD, and XPlor molecular dynamics packages. DCD @@ -391,7 +402,7 @@ of atom velocity and force and atomic charge. y, z attributes write atom coordinates "unscaled", in the appropriate distance units (Angstroms, sigma, etc). Use xs, ys, zs if you want the coordinates "scaled" to the box size, -so that each value is 0.0 to 1.0. If the simluation box is triclinic +so that each value is 0.0 to 1.0. If the simulation box is triclinic (tilted), then all atom coords will still be between 0.0 and 1.0. Use xu, yu, zu if you want the coordinates "unwrapped" by the image flags for each atom. Unwrapped means that if the atom has passed thru @@ -399,7 +410,12 @@ a periodic boundary one or more times, the value is printed for what the coordinate would be if it had not been wrapped back into the periodic box. Note that using xu, yu, zu means that the coordinate values may be far outside the box bounds printed with the -snapshot. The image flags can be printed directly using the ix, +snapshot. Using xsu, ysu, zsu is similar to using xu, yu, zu, +except that the unwrapped coordinates are scaled by the box size. Atoms +that have passed through a periodic boundary will have the corresponding +cooordinate increased or decreased by 1.0. +

      +

      The image flags can be printed directly using the ix, iy, iz attributes. The dump_modify command describes in more detail what is meant by scaled vs unscaled coordinates and the image flags. diff --git a/doc/dump.txt b/doc/dump.txt index 749b1b9109..11d0a9d730 100644 --- a/doc/dump.txt +++ b/doc/dump.txt @@ -37,7 +37,8 @@ args = list of arguments for a particular style :l {custom} args = list of atom attributes possible attributes = id, mol, type, mass, - x, y, z, xs, ys, zs, xu, yu, zu, ix, iy, iz, + x, y, z, xs, ys, zs, xu, yu, zu, + xsu, ysu, zsu, ix, iy, iz, vx, vy, vz, fx, fy, fz, q, mux, muy, muz, mu, radius, omegax, omegay, omegaz, @@ -52,6 +53,7 @@ args = list of arguments for a particular style :l x,y,z = unscaled atom coordinates xs,ys,zs = scaled atom coordinates xu,yu,zu = unwrapped atom coordinates + xsu,ysu,zsu = scaled unwrapped atom coordinates ix,iy,iz = box image that the atom is in vx,vy,vz = atom velocities fx,fy,fz = forces on atoms @@ -217,14 +219,23 @@ extended CFG format files, as used by the package. Since the extended CFG format uses a single snapshot of the system per file, a wildcard "*" must be included in the filename, as discussed below. The list of atom attributes for style {cfg} must -begin with "id type xs ys zs", since these quantities are needed to +begin with either "id type xs ys zs" or "id type xsu ysu zsu" or +since these quantities are needed to write the CFG files in the appropriate format (though the "id" and "type" fields do not appear explicitly in the file). Any remaining attributes will be stored as "auxiliary properties" in the CFG files. Note that you will typically want to use the "dump_modify element"_dump_modify.html command with CFG-formatted files, to associate element names with atom types, so that AtomEye can render -atoms appropriately. +atoms appropriately. 
When unwrapped coordinates {xsu}, {ysu}, and {zsu} +are requested, the nominal AtomEye periodic cell dimensions are expanded +by a large factor UNWRAPEXPAND = 10.0, which ensures atoms that are +displayed correctly for up to UNWRAPEXPAND/2 periodic boundary crossings +in any direction. +Beyond this, AtomEye will rewrap the unwrapped coordinates. +The expansion causes the atoms to be drawn farther +away from the viewer, but it is easy to zoom the atoms closer, and +the interatomic distances are unaffected. The {dcd} style writes DCD files, a standard atomic trajectory format used by the CHARMM, NAMD, and XPlor molecular dynamics packages. DCD @@ -380,7 +391,7 @@ There are several options for outputting atom coordinates. The {x}, {y}, {z} attributes write atom coordinates "unscaled", in the appropriate distance "units"_units.html (Angstroms, sigma, etc). Use {xs}, {ys}, {zs} if you want the coordinates "scaled" to the box size, -so that each value is 0.0 to 1.0. If the simluation box is triclinic +so that each value is 0.0 to 1.0. If the simulation box is triclinic (tilted), then all atom coords will still be between 0.0 and 1.0. Use {xu}, {yu}, {zu} if you want the coordinates "unwrapped" by the image flags for each atom. Unwrapped means that if the atom has passed thru @@ -388,7 +399,12 @@ a periodic boundary one or more times, the value is printed for what the coordinate would be if it had not been wrapped back into the periodic box. Note that using {xu}, {yu}, {zu} means that the coordinate values may be far outside the box bounds printed with the -snapshot. The image flags can be printed directly using the {ix}, +snapshot. Using {xsu}, {ysu}, {zsu} is similar to using {xu}, {yu}, {zu}, +except that the unwrapped coordinates are scaled by the box size. Atoms +that have passed through a periodic boundary will have the corresponding +cooordinate increased or decreased by 1.0. + +The image flags can be printed directly using the {ix}, {iy}, {iz} attributes. 
The "dump_modify"_dump_modify.html command describes in more detail what is meant by scaled vs unscaled coordinates and the image flags. From 7c04f95ce0f73b7dcd80b5d82bd256834205e85c Mon Sep 17 00:00:00 2001 From: athomps Date: Fri, 29 Apr 2011 23:40:14 +0000 Subject: [PATCH 11/21] Added xsu, ysu, zsu to dump custom and dump cfg git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6044 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/dump_cfg.cpp | 59 ++++++++++--- src/dump_cfg.h | 1 + src/dump_custom.cpp | 207 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 254 insertions(+), 13 deletions(-) diff --git a/src/dump_cfg.cpp b/src/dump_cfg.cpp index 471b5d3bb0..de7790bb89 100755 --- a/src/dump_cfg.cpp +++ b/src/dump_cfg.cpp @@ -30,6 +30,8 @@ #include "memory.h" #include "error.h" +#define UNWRAPEXPAND 10.0 + using namespace LAMMPS_NS; enum{INT,DOUBLE}; // same as in dump_custom.cpp @@ -41,10 +43,20 @@ DumpCFG::DumpCFG(LAMMPS *lmp, int narg, char **arg) : { if (narg < 10 || strcmp(arg[5],"id") != 0 || strcmp(arg[6],"type") != 0 || - strcmp(arg[7],"xs") != 0 || strcmp(arg[8],"ys") != 0 || - strcmp(arg[9],"zs") != 0) - error->all("Dump cfg arguments must start with 'id type xs ys zs'"); + (strcmp(arg[7],"xs") != 0 && strcmp(arg[7],"xsu") != 0) || + (strcmp(arg[8],"ys") != 0 && strcmp(arg[8],"ysu") != 0) || + (strcmp(arg[9],"zs") != 0 && strcmp(arg[9],"zsu") != 0) + ) + error->all("Dump cfg arguments must start with 'id type xs ys zs' or 'id type xsu ysu zsu'"); + if (strcmp(arg[7],"xs") == 0) + if (strcmp(arg[8],"ysu") == 0 || strcmp(arg[9],"zsu") == 0) + error->all("Dump cfg arguments can not mix xs|ys|zs with xsu|ysu|zsu"); + else unwrapflag = 0; + else if (strcmp(arg[8],"ys") == 0 || strcmp(arg[9],"zs") == 0) + error->all("Dump cfg arguments can not mix xs|ys|zs with xsu|ysu|zsu"); + else unwrapflag = 1; + ntypes = atom->ntypes; typenames = NULL; @@ -189,7 +201,9 @@ void DumpCFG::write_header(bigint n) // special handling for atom style peri // use average 
volume of particles to scale particles to mimic C atoms // scale box dimension to sc lattice for C with sigma = 1.44 Angstroms - + + // Special handling for unwrapped coordinates + double scale; if (atom->peri_flag) { int nlocal = atom->nlocal; @@ -199,9 +213,9 @@ void DumpCFG::write_header(bigint n) MPI_Allreduce(&vone,&vave,1,MPI_DOUBLE,MPI_SUM,world); if (atom->natoms) vave /= atom->natoms; if (vave > 0.0) scale = 1.44 / pow(vave,1.0/3.0); - else scale = 1.0; - } else scale = 1.0; - + } else if (unwrapflag == 1) scale = UNWRAPEXPAND; + else scale = 1.0; + if (me == 0 || multiproc) { char str[64]; sprintf(str,"Number of particles = %s\n",BIGINT_FORMAT); @@ -261,6 +275,8 @@ void DumpCFG::write_data(int n, double *mybuf) // write data lines in rbuf to file after transfer is done + double unwrap_coord; + if (nlines == nchosen) { for (itype = 1; itype <= ntypes; itype++) { for (i = 0; i < nchosen; i++) @@ -271,11 +287,30 @@ void DumpCFG::write_data(int n, double *mybuf) fprintf(fp,"%s\n",typenames[itype]); for (; i < nchosen; i++) { if (rbuf[i][1] == itype) { - for (j = 2; j < size_one; j++) { - if (vtype[j] == INT) - fprintf(fp,vformat[j],static_cast (rbuf[i][j])); - else fprintf(fp,vformat[j],rbuf[i][j]); - } + if (unwrapflag == 0) + for (j = 2; j < size_one; j++) { + if (vtype[j] == INT) + fprintf(fp,vformat[j],static_cast (rbuf[i][j])); + else fprintf(fp,vformat[j],rbuf[i][j]); + } + else + + // Unwrapped scaled coordinates are shifted to + // center of expanded box, to prevent + // rewrapping by AtomEye. Dividing by + // expansion factor restores correct + // interatomic distances. 
+ + for (j = 2; j < 5; j++) { + unwrap_coord = (rbuf[i][j] - 0.5)/UNWRAPEXPAND + 0.5; + fprintf(fp,vformat[j],unwrap_coord); + } + for (j = 5; j < size_one; j++) { + if (vtype[j] == INT) + fprintf(fp,vformat[j],static_cast (rbuf[i][j])); + else fprintf(fp,vformat[j],rbuf[i][j]); + } + fprintf(fp,"\n"); } } diff --git a/src/dump_cfg.h b/src/dump_cfg.h index 0467983f76..3ddfc8fa44 100755 --- a/src/dump_cfg.h +++ b/src/dump_cfg.h @@ -36,6 +36,7 @@ class DumpCFG : public DumpCustom { int nchosen; // # of lines to be written on a writing proc int nlines; // # of lines transferred from buf to rbuf double **rbuf; // buf of data lines for data lines rearrangement + int unwrapflag; // 1 if unwrapped coordinates are requested void init_style(); void write_header(bigint); diff --git a/src/dump_custom.cpp b/src/dump_custom.cpp index 823c5c0dbf..b0dcc982a2 100644 --- a/src/dump_custom.cpp +++ b/src/dump_custom.cpp @@ -35,7 +35,9 @@ using namespace LAMMPS_NS; // same list as in compute_property.cpp, also customize that command enum{ID,MOL,TYPE,MASS, - X,Y,Z,XS,YS,ZS,XSTRI,YSTRI,ZSTRI,XU,YU,ZU,XUTRI,YUTRI,ZUTRI,IX,IY,IZ, + X,Y,Z,XS,YS,ZS,XSTRI,YSTRI,ZSTRI,XU,YU,ZU,XUTRI,YUTRI,ZUTRI, + XSU,YSU,ZSU,XSUTRI,YSUTRI,ZSUTRI, + IX,IY,IZ, VX,VY,VZ,FX,FY,FZ, Q,MUX,MUY,MUZ,MU,RADIUS,OMEGAX,OMEGAY,OMEGAZ,ANGMOMX,ANGMOMY,ANGMOMZ, TQX,TQY,TQZ,SPIN,ERADIUS,ERVEL,ERFORCE, @@ -563,6 +565,70 @@ int DumpCustom::count() ptr = dchoose; nstride = 1; + } else if (thresh_array[ithresh] == XSU) { + double **x = atom->x; + int *image = atom->image; + double boxxlo = domain->boxlo[0]; + double invxprd = 1.0/domain->xprd; + for (i = 0; i < nlocal; i++) + dchoose[i] = (x[i][0] - boxxlo) * invxprd + (image[i] & 1023) - 512; + ptr = dchoose; + nstride = 1; + + } else if (thresh_array[ithresh] == YSU) { + double **x = atom->x; + int *image = atom->image; + double boxylo = domain->boxlo[1]; + double invyprd = 1.0/domain->yprd; + for (i = 0; i < nlocal; i++) + dchoose[i] = (x[i][1] - boxylo) * invyprd + (image[i] 
>> 10 & 1023) - 512; + ptr = dchoose; + nstride = 1; + + } else if (thresh_array[ithresh] == ZSU) { + double **x = atom->x; + int *image = atom->image; + double boxzlo = domain->boxlo[2]; + double invzprd = 1.0/domain->zprd; + for (i = 0; i < nlocal; i++) + dchoose[i] = (x[i][2] - boxzlo) * invzprd + (image[i] >> 20) - 512; + ptr = dchoose; + nstride = 1; + + } else if (thresh_array[ithresh] == XSUTRI) { + double **x = atom->x; + int *image = atom->image; + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + for (i = 0; i < nlocal; i++) + dchoose[i] = h_inv[0]*(x[i][0]-boxlo[0]) + + h_inv[5]*(x[i][1]-boxlo[1]) + + h_inv[4]*(x[i][2]-boxlo[2]) + + (image[i] & 1023) - 512; + ptr = dchoose; + nstride = 1; + } else if (thresh_array[ithresh] == YSUTRI) { + double **x = atom->x; + int *image = atom->image; + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + for (i = 0; i < nlocal; i++) + dchoose[i] = h_inv[1]*(x[i][1]-boxlo[1]) + + h_inv[3]*(x[i][2]-boxlo[2]) + + (image[i] >> 10 & 1023) - 512; + ptr = dchoose; + nstride = 1; + } else if (thresh_array[ithresh] == ZSUTRI) { + double **x = atom->x; + int *image = atom->image; + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + for (i = 0; i < nlocal; i++) + dchoose[i] = h_inv[2]*(x[i][2]-boxlo[2]) + + (image[i] >> 20) - 512; + ptr = dchoose; + nstride = 1; + } else if (thresh_array[ithresh] == IX) { int *image = atom->image; for (i = 0; i < nlocal; i++) @@ -879,6 +945,18 @@ void DumpCustom::parse_fields(int narg, char **arg) if (domain->triclinic) pack_choice[i] = &DumpCustom::pack_zu_triclinic; else pack_choice[i] = &DumpCustom::pack_zu; vtype[i] = DOUBLE; + } else if (strcmp(arg[iarg],"xsu") == 0) { + if (domain->triclinic) pack_choice[i] = &DumpCustom::pack_xsu_triclinic; + else pack_choice[i] = &DumpCustom::pack_xsu; + vtype[i] = DOUBLE; + } else if (strcmp(arg[iarg],"ysu") == 0) { + if (domain->triclinic) pack_choice[i] = &DumpCustom::pack_ysu_triclinic; + else 
pack_choice[i] = &DumpCustom::pack_ysu; + vtype[i] = DOUBLE; + } else if (strcmp(arg[iarg],"zsu") == 0) { + if (domain->triclinic) pack_choice[i] = &DumpCustom::pack_zsu_triclinic; + else pack_choice[i] = &DumpCustom::pack_zsu; + vtype[i] = DOUBLE; } else if (strcmp(arg[iarg],"ix") == 0) { pack_choice[i] = &DumpCustom::pack_ix; vtype[i] = INT; @@ -1254,6 +1332,19 @@ int DumpCustom::modify_param(int narg, char **arg) else if (strcmp(arg[1],"zu") == 0 && domain->triclinic == 1) thresh_array[nthresh] = ZUTRI; + else if (strcmp(arg[1],"xsu") == 0 && domain->triclinic == 0) + thresh_array[nthresh] = XSU; + else if (strcmp(arg[1],"xsu") == 0 && domain->triclinic == 1) + thresh_array[nthresh] = XSUTRI; + else if (strcmp(arg[1],"ysu") == 0 && domain->triclinic == 0) + thresh_array[nthresh] = YSU; + else if (strcmp(arg[1],"ysu") == 0 && domain->triclinic == 1) + thresh_array[nthresh] = YSUTRI; + else if (strcmp(arg[1],"zsu") == 0 && domain->triclinic == 0) + thresh_array[nthresh] = ZSU; + else if (strcmp(arg[1],"zsu") == 0 && domain->triclinic == 1) + thresh_array[nthresh] = ZSUTRI; + else if (strcmp(arg[1],"ix") == 0) thresh_array[nthresh] = IX; else if (strcmp(arg[1],"iy") == 0) thresh_array[nthresh] = IY; else if (strcmp(arg[1],"iz") == 0) thresh_array[nthresh] = IZ; @@ -1821,6 +1912,120 @@ void DumpCustom::pack_zu_triclinic(int n) /* ---------------------------------------------------------------------- */ +void DumpCustom::pack_xsu(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double boxxlo = domain->boxlo[0]; + double invxprd = 1.0/domain->xprd; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = (x[i][0] - boxxlo) * invxprd + (image[i] & 1023) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_ysu(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double boxylo = 
domain->boxlo[1]; + double invyprd = 1.0/domain->yprd; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = (x[i][1] - boxylo) * invyprd + (image[i] >> 10 & 1023) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_zsu(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double boxzlo = domain->boxlo[2]; + double invzprd = 1.0/domain->zprd; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = (x[i][2] - boxzlo) * invzprd + (image[i] >> 20) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_xsu_triclinic(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = h_inv[0]*(x[i][0]-boxlo[0]) + + h_inv[5]*(x[i][1]-boxlo[1]) + + h_inv[4]*(x[i][2]-boxlo[2]) + + (image[i] & 1023) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_ysu_triclinic(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = h_inv[1]*(x[i][1]-boxlo[1]) + + h_inv[3]*(x[i][2]-boxlo[2]) + + (image[i] >> 10 & 1023) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_zsu_triclinic(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = h_inv[2]*(x[i][2]-boxlo[2]) + + (image[i] >> 20) - 512; + n += 
size_one; + } +} + +/* ---------------------------------------------------------------------- */ + void DumpCustom::pack_ix(int n) { int *image = atom->image; From 18b365794d3328b568c07bea29a1afc81568a9cf Mon Sep 17 00:00:00 2001 From: athomps Date: Fri, 29 Apr 2011 23:40:29 +0000 Subject: [PATCH 12/21] Added xsu, ysu, zsu to dump custom and dump cfg git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6045 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/dump_custom.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/dump_custom.h b/src/dump_custom.h index c5c867de5a..b6a65b2822 100644 --- a/src/dump_custom.h +++ b/src/dump_custom.h @@ -120,6 +120,12 @@ class DumpCustom : public Dump { void pack_xu_triclinic(int); void pack_yu_triclinic(int); void pack_zu_triclinic(int); + void pack_xsu(int); + void pack_ysu(int); + void pack_zsu(int); + void pack_xsu_triclinic(int); + void pack_ysu_triclinic(int); + void pack_zsu_triclinic(int); void pack_ix(int); void pack_iy(int); void pack_iz(int); From fb012a60aff355d063d895eac7efabaa00f1db17 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 23:41:40 +0000 Subject: [PATCH 13/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6046 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/fix_rigid.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fix_rigid.cpp b/src/fix_rigid.cpp index b24b87b41c..0d276cc348 100644 --- a/src/fix_rigid.cpp +++ b/src/fix_rigid.cpp @@ -532,8 +532,8 @@ void FixRigid::init() } // grow extended arrays and set extended flags for each particle - // dorientflag = 1 if any particles store dipole orientation - // qorientflag = 1 if any particles store quat orientation + // qorientflag = 1 if any particle stores quat orientation + // dorientflag = 1 if any particle stores dipole orientation if (extended) { if (atom->mu_flag) dorientflag = 1; From 1773dd293f39c8bf3e5adb57c8a4e7ad8a251403 Mon Sep 17 00:00:00 2001 From: athomps Date: Fri, 29 Apr 2011 23:44:34 +0000 
Subject: [PATCH 14/21] reverted accidental change git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6048 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_start.txt | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/doc/Section_start.txt b/doc/Section_start.txt index fc45cf8cda..4b4d96693f 100644 --- a/doc/Section_start.txt +++ b/doc/Section_start.txt @@ -782,9 +782,7 @@ one-letter abbreviation can be used: -in or -i -log or -l -screen or -s --var or -v --plog or -pl --pscreen or -ps :ul +-var or -v :ul For example, lmp_ibm might be launched as follows: @@ -848,7 +846,6 @@ logfile is named "file" and each partition also logs information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no log files are created. Using a "log"_log.html command in the input script will override this setting. -Option -plog will override the name of the partition log files file.N. -screen file :pre @@ -862,8 +859,7 @@ the partition ID. If the switch is specified in multi-partition mode, the hi-level screen dump is named "file" and each partition also writes screen information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no screen -output is performed. Option -pscreen will override the name of the -partition screen files file.N. +output is performed. -var name value1 value2 ... :pre @@ -882,27 +878,6 @@ defining index and other kinds of variables and "this section"_Section_commands.html#3_2 for more info on using variables in input scripts. --plog file:pre -Specify the base name for the partition log files, -so partition N writes log information to file.N. If file is -none, then no partition log files are created. -This overrides the -filename specified in the -log command-line option -If this option is not used -the log file for partition N is log.lammps.N or whatever is specified by -the -log command-line option. 
- --pscreen file:pre -specify the base name for the -partition screen file, so partition N writes -screen information to file.N. If file is -none, then no partition screen files are created. -This overrides the -filename specified in the -screen command-line option. -If this option is not used -the screen file for partition N is screen.N or whatever is specified by -the -screen command-line option. - :line 2.7 LAMMPS screen output :h4,link(2_7) From f6151f67354d145b0f9d0da9cc76c66e15a98ea2 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:01:49 +0000 Subject: [PATCH 15/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6051 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_commands.html | 16 ++-- doc/Section_commands.txt | 4 + doc/Section_errors.html | 29 ++++-- doc/Section_errors.txt | 29 ++++-- doc/Section_intro.html | 8 ++ doc/Section_intro.txt | 8 ++ doc/Section_start.html | 197 ++++++++++++++++++-------------------- doc/Section_start.txt | 197 ++++++++++++++++++-------------------- doc/fix_gpu.html | 18 ++-- doc/fix_gpu.txt | 18 ++-- doc/kspace_style.html | 37 ++++++- doc/kspace_style.txt | 37 ++++++- doc/pair_coeff.html | 2 + doc/pair_coeff.txt | 2 + doc/pair_lj_expand.html | 33 ++++++- doc/pair_lj_expand.txt | 31 +++++- doc/pair_morse.html | 34 ++++++- doc/pair_morse.txt | 31 +++++- doc/pair_style.html | 2 + doc/pair_style.txt | 2 + 20 files changed, 478 insertions(+), 257 deletions(-) diff --git a/doc/Section_commands.html b/doc/Section_commands.html index 70bc1e8857..5f996268de 100644 --- a/doc/Section_commands.html +++ b/doc/Section_commands.html @@ -399,12 +399,13 @@ potentials. 
Click on the style itself for a full description: lj/charmm/coul/long/gpulj/charmm/coul/long/optlj/class2lj/class2/coul/cut lj/class2/coul/longlj/cutlj/cut/gpulj/cut/opt lj/cut/coul/cutlj/cut/coul/cut/gpulj/cut/coul/debyelj/cut/coul/long -lj/cut/coul/long/gpulj/cut/coul/long/tip4plj/expandlj/gromacs -lj/gromacs/coul/gromacslj/smoothlj96/cutlj96/cut/gpu -lubricatemeammorsemorse/opt -peri/lpsperi/pmbreaxresquared -softswtabletersoff -tersoff/zblyukawayukawa/colloid +lj/cut/coul/long/gpulj/cut/coul/long/tip4plj/expandlj/expand/gpu +lj/gromacslj/gromacs/coul/gromacslj/smoothlj96/cut +lj96/cut/gpulubricatemeammorse +morse/gpumorse/optperi/lpsperi/pmb +reaxresquaredsoftsw +tabletersofftersoff/zblyukawa +yukawa/colloid

      These are pair styles contributed by users, which can be used if @@ -483,7 +484,8 @@ description: Kspace solvers. Click on the style itself for a full description:

      These are Kspace solvers contributed by users, which can be used if diff --git a/doc/Section_commands.txt b/doc/Section_commands.txt index 1a18b8b9b2..1c58401303 100644 --- a/doc/Section_commands.txt +++ b/doc/Section_commands.txt @@ -611,6 +611,7 @@ potentials. Click on the style itself for a full description: "lj/cut/coul/long/gpu"_pair_lj.html, "lj/cut/coul/long/tip4p"_pair_lj.html, "lj/expand"_pair_lj_expand.html, +"lj/expand/gpu"_pair_lj_expand.html, "lj/gromacs"_pair_gromacs.html, "lj/gromacs/coul/gromacs"_pair_gromacs.html, "lj/smooth"_pair_lj_smooth.html, @@ -619,6 +620,7 @@ potentials. Click on the style itself for a full description: "lubricate"_pair_lubricate.html, "meam"_pair_meam.html, "morse"_pair_morse.html, +"morse/gpu"_pair_morse.html, "morse/opt"_pair_morse.html, "peri/lps"_pair_peri.html, "peri/pmb"_pair_peri.html, @@ -728,6 +730,8 @@ Kspace solvers. Click on the style itself for a full description: "ewald"_kspace_style.html, "pppm"_kspace_style.html, +"pppm/gpu/single"_kspace_style.html, +"pppm/gpu/double"_kspace_style.html, "pppm/tip4p"_kspace_style.html :tb(c=4,ea=c,w=100) These are Kspace solvers contributed by users, which can be used if diff --git a/doc/Section_errors.html b/doc/Section_errors.html index 8cd9ed6e46..b90784e0c0 100644 --- a/doc/Section_errors.html +++ b/doc/Section_errors.html @@ -173,6 +173,10 @@ the bond topologies you have defined. neighbors for each atom. This likely means something is wrong with the bond topologies you have defined. +

      Accelerated style in input script but no fix gpu + +
      GPU acceleration requires fix gpu in the input script. +
      All angle coeffs are not set
      All angle coefficients must be set in the data file or by the @@ -1240,9 +1244,9 @@ non-periodic z dimension. unless you use the kspace_modify command to define a 2d slab with a non-periodic z dimension. -
      Cannot use pair hybrid with multiple GPU pair styles +
      Cannot use pair hybrid with GPU neighbor builds -
      Self-explanatory. +
      See documentation for fix gpu.
      Cannot use pair tail corrections with 2d simulations @@ -1843,7 +1847,7 @@ does not exist.
      Self-explanatory. -
      Could not find or initialize a specified accelerator device +
      Could not find/initialize a specified accelerator device
      Your GPU setup is invalid. @@ -2123,6 +2127,10 @@ model. used. Most likely, one or more atoms have been blown out of the simulation box to a great distance. +
      Double precision is not supported on this accelerator. + +
      In this case, you must compile the GPU library for single precision. +
      Dump cfg and fix not computed at compatible times
      The fix must produce per-atom quantities on timesteps that dump cfg @@ -2355,6 +2363,10 @@ smaller simulation or on more processors.
      Self-explanatory. +
      Fix gpu split must be positive for hybrid pair styles. + +
      See documentation for fix gpu. +
      Fix ID for compute atom/molecule does not exist
      Self-explanatory. @@ -3227,6 +3239,11 @@ this fix.
      This is the way the fix must be defined in your input script. +
      GPU library not compiled for this accelerator + +
      The GPU library was not built for your accelerator. Check the arch flag in +lib/gpu. +
      Gmask function in equal-style variable formula
      Gmask is per-atom operation. @@ -3509,7 +3526,7 @@ simulation box.
      Eigensolve for rigid body was not sufficiently accurate. -
      Insufficient memory on accelerator (or no fix gpu) +
      Insufficient memory on accelerator.
      Self-explanatory. @@ -4587,10 +4604,6 @@ contain the same atom.
      Any rigid body defined by the fix rigid command must contain 2 or more atoms. -
      Out of memory on GPGPU - -
      You are attempting to run with too many atoms on the GPU. -
      Out of range atoms - cannot compute PPPM
      One or more atoms are attempting to map their charge to a PPPM grid diff --git a/doc/Section_errors.txt b/doc/Section_errors.txt index d94e8a9be7..0e2b2e804b 100644 --- a/doc/Section_errors.txt +++ b/doc/Section_errors.txt @@ -170,6 +170,10 @@ An inconsistency was detected when computing the number of 1-4 neighbors for each atom. This likely means something is wrong with the bond topologies you have defined. :dd +{Accelerated style in input script but no fix gpu} :dt + +GPU acceleration requires fix gpu in the input script. :dd + {All angle coeffs are not set} :dt All angle coefficients must be set in the data file or by the @@ -1237,9 +1241,9 @@ For kspace style pppm, all 3 dimensions must have periodic boundaries unless you use the kspace_modify command to define a 2d slab with a non-periodic z dimension. :dd -{Cannot use pair hybrid with multiple GPU pair styles} :dt +{Cannot use pair hybrid with GPU neighbor builds} :dt -Self-explanatory. :dd +See documentation for fix gpu. :dd {Cannot use pair tail corrections with 2d simulations} :dt @@ -1840,7 +1844,7 @@ The compute ID for computing temperature does not exist. :dd Self-explanatory. :dd -{Could not find or initialize a specified accelerator device} :dt +{Could not find/initialize a specified accelerator device} :dt Your GPU setup is invalid. :dd @@ -2120,6 +2124,10 @@ The domain has become extremely large so that neighbor bins cannot be used. Most likely, one or more atoms have been blown out of the simulation box to a great distance. :dd +{Double precision is not supported on this accelerator.} :dt + +In this case, you must compile the GPU library for single precision. :dd + {Dump cfg and fix not computed at compatible times} :dt The fix must produce per-atom quantities on timesteps that dump cfg @@ -2352,6 +2360,10 @@ This is not allowed. Make your SRD bin size smaller. :dd Self-explanatory. :dd +{Fix gpu split must be positive for hybrid pair styles.} :dt + +See documentation for fix gpu. 
:dd + {Fix ID for compute atom/molecule does not exist} :dt Self-explanatory. :dd @@ -3224,6 +3236,11 @@ When using a "*" in the restart file name, no matching file was found. :dd This is the way the fix must be defined in your input script. :dd +{GPU library not compiled for this accelerator} :dt + +The GPU library was not built for your accelerator. Check the arch flag in +lib/gpu. :dd + {Gmask function in equal-style variable formula} :dt Gmask is per-atom operation. :dd @@ -3506,7 +3523,7 @@ Eigensolve for rigid body was not sufficiently accurate. :dd Eigensolve for rigid body was not sufficiently accurate. :dd -{Insufficient memory on accelerator (or no fix gpu)} :dt +{Insufficient memory on accelerator. } :dt Self-explanatory. :dd @@ -4584,10 +4601,6 @@ contain the same atom. :dd Any rigid body defined by the fix rigid command must contain 2 or more atoms. :dd -{Out of memory on GPGPU} :dt - -You are attempting to run with too many atoms on the GPU. :dd - {Out of range atoms - cannot compute PPPM} :dt One or more atoms are attempting to map their charge to a PPPM grid diff --git a/doc/Section_intro.html b/doc/Section_intro.html index f9b00bb689..bce1a9d718 100644 --- a/doc/Section_intro.html +++ b/doc/Section_intro.html @@ -505,6 +505,14 @@ the list.
      + + + + + + + + diff --git a/doc/Section_intro.txt b/doc/Section_intro.txt index e4c26c8aab..a8e46df996 100644 --- a/doc/Section_intro.txt +++ b/doc/Section_intro.txt @@ -490,6 +490,14 @@ the list. :link(sjp,http://www.sandia.gov/~sjplimp) +pppm GPU single and double : Mike Brown (ORNL) +pair_style lj/cut/expand : Inderaj Bains (NVIDIA) +temperature accelerated dynamics (TAD) : Aidan Thompson (Sandia) +pair reax/c and fix qeq/reax : Metin Aktulga (Purdue, now LBNL) +DREIDING force field, pair_style hbond/dreiding, etc : Tod Pascal (CalTech) +fix adapt and compute ti for thermodynamic integreation for free energies : Sai Jayaraman (Sandia) +pair born and pair gauss : Sai Jayaraman (Sandia) +stochastic rotation dynamics (SRD) via fix srd : Jemery Lechman (Sandia) and Pieter in 't Veld (BASF) ipp Perl script tool : Reese Jones (Sandia) eam_database and createatoms tools : Xiaowang Zhou (Sandia) electron force field (eFF) : Andres Jaramillo-Botero and Julius Su (Caltech) diff --git a/doc/Section_start.html b/doc/Section_start.html index a83aaa0ad5..08287e3377 100644 --- a/doc/Section_start.html +++ b/doc/Section_start.html @@ -994,143 +994,130 @@ processing units (GPUs). We plan to add more over time. Currently, they only support NVIDIA GPU cards. To use them you need to install certain NVIDIA CUDA software on your system:

      -
      • Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 -
      • Go to http://www.nvidia.com/object/cuda_get.html -
      • Install a driver and toolkit appropriate for your system (SDK is not necessary) -
      • Follow the instructions in README in lammps/lib/gpu to build the library. -
      • Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties +
        • Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go +
        • to http://www.nvidia.com/object/cuda_get.html Install a driver and +
        • toolkit appropriate for your system (SDK is not necessary) Follow the +
        • instructions in README in lammps/lib/gpu to build the library. Run +
        • lammps/lib/gpu/nvc_get_devices to list supported devices and +
        • properties

        GPU configuration

        When using GPUs, you are restricted to one physical GPU per LAMMPS -process. Multiple processes can share a single GPU and in many cases it -will be more efficient to run with multiple processes per GPU. Any GPU -accelerated style requires that fix gpu be used in the -input script to select and initialize the GPUs. The format for the fix -is: +process. Multiple processes can share a single GPU and in many cases +it will be more efficient to run with multiple processes per GPU. Any +GPU accelerated style requires that fix gpu be used in +the input script to select and initialize the GPUs. The format for the +fix is:

        fix name all gpu mode first last split 
         

        where name is the name for the fix. The gpu fix must be the first -fix specified for a given run, otherwise the program will exit -with an error. The gpu fix will not have any effect on runs -that do not use GPU acceleration; there should be no problem -with specifying the fix first in any input script. +fix specified for a given run, otherwise the program will exit with an +error. The gpu fix will not have any effect on runs that do not use +GPU acceleration; there should be no problem with specifying the fix +first in any input script.

        -

        mode can be either "force" or "force/neigh". In the former, -neighbor list calculation is performed on the CPU using the -standard LAMMPS routines. In the latter, the neighbor list -calculation is performed on the GPU. The GPU neighbor list -can be used for better performance, however, it -should not be used with a triclinic box. +

        mode can be either "force" or "force/neigh". In the former, neighbor +list calculation is performed on the CPU using the standard LAMMPS +routines. In the latter, the neighbor list calculation is performed on +the GPU. The GPU neighbor list can be used for better performance, +however, it cannot be used with a triclinic box or with +hybrid pair styles.

        -

        There are cases when it might be more efficient to select the CPU for neighbor -list builds. If a non-GPU enabled style requires a neighbor list, it will also -be built using CPU routines. Redundant CPU and GPU neighbor list calculations -will typically be less efficient. For hybrid pair -styles, GPU calculated neighbor lists might be less efficient because -no particles will be skipped in a given neighbor list. +

        There are cases when it might be more efficient to select the CPU for +neighbor list builds. If a non-GPU enabled style requires a neighbor +list, it will also be built using CPU routines. Redundant CPU and GPU +neighbor list calculations will typically be less efficient.

        -

        first is the ID (as reported by lammps/lib/gpu/nvc_get_devices) -of the first GPU that will be used on each node. last is the -ID of the last GPU that will be used on each node. If you have -only one GPU per node, first and last will typically both be -0. Selecting a non-sequential set of GPU IDs (e.g. 0,1,3) -is not currently supported. +

        first is the ID (as reported by lammps/lib/gpu/nvc_get_devices) of +the first GPU that will be used on each node. last is the ID of the +last GPU that will be used on each node. If you have only one GPU per +node, first and last will typically both be 0. Selecting a +non-sequential set of GPU IDs (e.g. 0,1,3) is not currently supported.

        -

        split is the fraction of particles whose forces, torques, -energies, and/or virials will be calculated on the GPU. This -can be used to perform CPU and GPU force calculations -simultaneously. If split is negative, the software will -attempt to calculate the optimal fraction automatically -every 25 timesteps based on CPU and GPU timings. Because the GPU speedups -are dependent on the number of particles, automatic calculation of the -split can be less efficient, but typically results in loop times -within 20% of an optimal fixed split. +

        split is the fraction of particles whose forces, torques, energies, +and/or virials will be calculated on the GPU. This can be used to +perform CPU and GPU force calculations simultaneously. If split is +negative, the software will attempt to calculate the optimal fraction +automatically every 25 timesteps based on CPU and GPU timings. Because +the GPU speedups are dependent on the number of particles, automatic +calculation of the split can be less efficient, but typically results +in loop times within 20% of an optimal fixed split.

        -

        If you have two GPUs per node, 8 CPU cores per node, and -would like to run on 4 nodes with dynamic balancing of -force calculation across CPU and GPU cores, the fix -might be +

        If you have two GPUs per node, 8 CPU cores per node, and would like to +run on 4 nodes with dynamic balancing of force calculation across CPU +and GPU cores, the fix might be

        fix 0 all gpu force/neigh 0 1 -1 
         
        -

        with LAMMPS run on 32 processes. In this case, all -CPU cores and GPU devices on the nodes would be utilized. -Each GPU device would be shared by 4 CPU cores. The -CPU cores would perform force calculations for some -fraction of the particles at the same time the GPUs -performed force calculation for the other particles. +

        with LAMMPS run on 32 processes. In this case, all CPU cores and GPU +devices on the nodes would be utilized. Each GPU device would be +shared by 4 CPU cores. The CPU cores would perform force calculations +for some fraction of the particles at the same time the GPUs performed +force calculation for the other particles.

        -

        Because of the large number of cores on each GPU -device, it might be more efficient to run on fewer -processes per GPU when the number of particles per process -is small (100's of particles); this can be necessary -to keep the GPU cores busy. +

        Because of the large number of cores on each GPU device, it might be +more efficient to run on fewer processes per GPU when the number of +particles per process is small (100's of particles); this can be +necessary to keep the GPU cores busy.

        GPU input script

        -

        In order to use GPU acceleration in LAMMPS, -fix_gpu -should be used in order to initialize and configure the -GPUs for use. Additionally, GPU enabled styles must be -selected in the input script. Currently, -this is limited to a few pair styles. -Some GPU-enabled styles have additional restrictions -listed in their documentation. +

        In order to use GPU acceleration in LAMMPS, fix_gpu +should be used in order to initialize and configure the GPUs for +use. Additionally, GPU enabled styles must be selected in the input +script. Currently, this is limited to a few pair +styles and PPPM. Some GPU-enabled styles have +additional restrictions listed in their documentation.

        GPU asynchronous pair computation

        -

        The GPU accelerated pair styles can be used to perform -pair style force calculation on the GPU while other -calculations are -performed on the CPU. One method to do this is to specify -a split in the gpu fix as described above. In this case, -force calculation for the pair style will also be performed -on the CPU. +

        The GPU accelerated pair styles can be used to perform pair style +force calculation on the GPU while other calculations are performed on +the CPU. One method to do this is to specify a split in the gpu fix +as described above. In this case, force calculation for the pair +style will also be performed on the CPU.

        -

        When the CPU work in a GPU pair style has finished, -the next force computation will begin, possibly before the -GPU has finished. If split is 1.0 in the gpu fix, the next -force computation will begin almost immediately. This can -be used to run a hybrid GPU pair style at -the same time as a hybrid CPU pair style. In this case, the -GPU pair style should be first in the hybrid command in order to -perform simultaneous calculations. This also -allows bond, angle, -dihedral, improper, -and long-range force -computations to be run simultaneously with the GPU pair style. -Once all CPU force computations have completed, the gpu fix -will block until the GPU has finished all work before continuing -the run. +

        When the CPU work in a GPU pair style has finished, the next force +computation will begin, possibly before the GPU has finished. If +split is 1.0 in the gpu fix, the next force computation will begin +almost immediately. This can be used to run a +hybrid GPU pair style at the same time as a hybrid +CPU pair style. In this case, the GPU pair style should be first in +the hybrid command in order to perform simultaneous calculations. This +also allows bond, angle, +dihedral, improper, and +long-range force computations to be run +simultaneously with the GPU pair style. Once all CPU force +computations have completed, the gpu fix will block until the GPU has +finished all work before continuing the run.

        GPU timing

        GPU accelerated pair styles can perform computations asynchronously -with CPU computations. The "Pair" time reported by LAMMPS -will be the maximum of the time required to complete the CPU -pair style computations and the time required to complete the GPU -pair style computations. Any time spent for GPU-enabled pair styles -for computations that run simultaneously with bond, -angle, dihedral, -improper, and long-range calculations -will not be included in the "Pair" time. +with CPU computations. The "Pair" time reported by LAMMPS will be the +maximum of the time required to complete the CPU pair style +computations and the time required to complete the GPU pair style +computations. Any time spent for GPU-enabled pair styles for +computations that run simultaneously with bond, +angle, dihedral, +improper, and long-range +calculations will not be included in the "Pair" time.

        -

        When mode for the gpu fix is force/neigh, -the time for neighbor list calculations on the GPU will be added -into the "Pair" time, not the "Neigh" time. A breakdown of the -times required for various tasks on the GPU (data copy, neighbor -calculations, force computations, etc.) are output only -with the LAMMPS screen output at the end of each run. These timings represent -total time spent on the GPU for each routine, regardless of asynchronous -CPU calculations. +

        When mode for the gpu fix is force/neigh, the time for neighbor list +calculations on the GPU will be added into the "Pair" time, not the +"Neigh" time. A breakdown of the times required for various tasks on +the GPU (data copy, neighbor calculations, force computations, etc.) +are output only with the LAMMPS screen output at the end of each +run. These timings represent total time spent on the GPU for each +routine, regardless of asynchronous CPU calculations.

        GPU single vs double precision

        -

        See the lammps/lib/gpu/README file for instructions on how to build -the LAMMPS gpu library for single, mixed, and double precision. The latter -requires that your GPU card supports double precision. +

        See the lammps/lib/gpu/README file for instructions on how to build +the LAMMPS gpu library for single, mixed, and double precision. The +latter requires that your GPU card supports double precision.


        diff --git a/doc/Section_start.txt b/doc/Section_start.txt index 4b4d96693f..fbdd015ab4 100644 --- a/doc/Section_start.txt +++ b/doc/Section_start.txt @@ -984,143 +984,130 @@ processing units (GPUs). We plan to add more over time. Currently, they only support NVIDIA GPU cards. To use them you need to install certain NVIDIA CUDA software on your system: -Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 -Go to http://www.nvidia.com/object/cuda_get.html -Install a driver and toolkit appropriate for your system (SDK is not necessary) -Follow the instructions in README in lammps/lib/gpu to build the library. -Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties :ul +Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go +to http://www.nvidia.com/object/cuda_get.html Install a driver and +toolkit appropriate for your system (SDK is not necessary) Follow the +instructions in README in lammps/lib/gpu to build the library. Run +lammps/lib/gpu/nvc_get_devices to list supported devices and +properties :ul GPU configuration :h4 When using GPUs, you are restricted to one physical GPU per LAMMPS -process. Multiple processes can share a single GPU and in many cases it -will be more efficient to run with multiple processes per GPU. Any GPU -accelerated style requires that "fix gpu"_fix_gpu.html be used in the -input script to select and initialize the GPUs. The format for the fix -is: +process. Multiple processes can share a single GPU and in many cases +it will be more efficient to run with multiple processes per GPU. Any +GPU accelerated style requires that "fix gpu"_fix_gpu.html be used in +the input script to select and initialize the GPUs. The format for the +fix is: fix {name} all gpu {mode} {first} {last} {split} :pre where {name} is the name for the fix. The gpu fix must be the first -fix specified for a given run, otherwise the program will exit -with an error. 
The gpu fix will not have any effect on runs -that do not use GPU acceleration; there should be no problem -with specifying the fix first in any input script. +fix specified for a given run, otherwise the program will exit with an +error. The gpu fix will not have any effect on runs that do not use +GPU acceleration; there should be no problem with specifying the fix +first in any input script. -{mode} can be either "force" or "force/neigh". In the former, -neighbor list calculation is performed on the CPU using the -standard LAMMPS routines. In the latter, the neighbor list -calculation is performed on the GPU. The GPU neighbor list -can be used for better performance, however, it -should not be used with a triclinic box. +{mode} can be either "force" or "force/neigh". In the former, neighbor +list calculation is performed on the CPU using the standard LAMMPS +routines. In the latter, the neighbor list calculation is performed on +the GPU. The GPU neighbor list can be used for better performance, +however, it cannot not be used with a triclinic box or with +"hybrid"_pair_hybrid.html pair styles. -There are cases when it might be more efficient to select the CPU for neighbor -list builds. If a non-GPU enabled style requires a neighbor list, it will also -be built using CPU routines. Redundant CPU and GPU neighbor list calculations -will typically be less efficient. For "hybrid"_pair_hybrid.html pair -styles, GPU calculated neighbor lists might be less efficient because -no particles will be skipped in a given neighbor list. +There are cases when it might be more efficient to select the CPU for +neighbor list builds. If a non-GPU enabled style requires a neighbor +list, it will also be built using CPU routines. Redundant CPU and GPU +neighbor list calculations will typically be less efficient. -{first} is the ID (as reported by lammps/lib/gpu/nvc_get_devices) -of the first GPU that will be used on each node. 
{last} is the -ID of the last GPU that will be used on each node. If you have -only one GPU per node, {first} and {last} will typically both be -0. Selecting a non-sequential set of GPU IDs (e.g. 0,1,3) -is not currently supported. +{first} is the ID (as reported by lammps/lib/gpu/nvc_get_devices) of +the first GPU that will be used on each node. {last} is the ID of the +last GPU that will be used on each node. If you have only one GPU per +node, {first} and {last} will typically both be 0. Selecting a +non-sequential set of GPU IDs (e.g. 0,1,3) is not currently supported. -{split} is the fraction of particles whose forces, torques, -energies, and/or virials will be calculated on the GPU. This -can be used to perform CPU and GPU force calculations -simultaneously. If {split} is negative, the software will -attempt to calculate the optimal fraction automatically -every 25 timesteps based on CPU and GPU timings. Because the GPU speedups -are dependent on the number of particles, automatic calculation of the -split can be less efficient, but typically results in loop times -within 20% of an optimal fixed split. +{split} is the fraction of particles whose forces, torques, energies, +and/or virials will be calculated on the GPU. This can be used to +perform CPU and GPU force calculations simultaneously. If {split} is +negative, the software will attempt to calculate the optimal fraction +automatically every 25 timesteps based on CPU and GPU timings. Because +the GPU speedups are dependent on the number of particles, automatic +calculation of the split can be less efficient, but typically results +in loop times within 20% of an optimal fixed split. 
-If you have two GPUs per node, 8 CPU cores per node, and -would like to run on 4 nodes with dynamic balancing of -force calculation across CPU and GPU cores, the fix -might be +If you have two GPUs per node, 8 CPU cores per node, and would like to +run on 4 nodes with dynamic balancing of force calculation across CPU +and GPU cores, the fix might be fix 0 all gpu force/neigh 0 1 -1 :pre -with LAMMPS run on 32 processes. In this case, all -CPU cores and GPU devices on the nodes would be utilized. -Each GPU device would be shared by 4 CPU cores. The -CPU cores would perform force calculations for some -fraction of the particles at the same time the GPUs -performed force calculation for the other particles. +with LAMMPS run on 32 processes. In this case, all CPU cores and GPU +devices on the nodes would be utilized. Each GPU device would be +shared by 4 CPU cores. The CPU cores would perform force calculations +for some fraction of the particles at the same time the GPUs performed +force calculation for the other particles. -Because of the large number of cores on each GPU -device, it might be more efficient to run on fewer -processes per GPU when the number of particles per process -is small (100's of particles); this can be necessary -to keep the GPU cores busy. +Because of the large number of cores on each GPU device, it might be +more efficient to run on fewer processes per GPU when the number of +particles per process is small (100's of particles); this can be +necessary to keep the GPU cores busy. GPU input script :h4 -In order to use GPU acceleration in LAMMPS, -"fix_gpu"_fix_gpu.html -should be used in order to initialize and configure the -GPUs for use. Additionally, GPU enabled styles must be -selected in the input script. Currently, -this is limited to a few "pair styles"_pair_style.html. -Some GPU-enabled styles have additional restrictions -listed in their documentation. 
+In order to use GPU acceleration in LAMMPS, "fix_gpu"_fix_gpu.html +should be used in order to initialize and configure the GPUs for +use. Additionally, GPU enabled styles must be selected in the input +script. Currently, this is limited to a few "pair +styles"_pair_style.html and PPPM. Some GPU-enabled styles have +additional restrictions listed in their documentation. GPU asynchronous pair computation :h4 -The GPU accelerated pair styles can be used to perform -pair style force calculation on the GPU while other -calculations are -performed on the CPU. One method to do this is to specify -a {split} in the gpu fix as described above. In this case, -force calculation for the pair style will also be performed -on the CPU. +The GPU accelerated pair styles can be used to perform pair style +force calculation on the GPU while other calculations are performed on +the CPU. One method to do this is to specify a {split} in the gpu fix +as described above. In this case, force calculation for the pair +style will also be performed on the CPU. -When the CPU work in a GPU pair style has finished, -the next force computation will begin, possibly before the -GPU has finished. If {split} is 1.0 in the gpu fix, the next -force computation will begin almost immediately. This can -be used to run a "hybrid"_pair_hybrid.html GPU pair style at -the same time as a hybrid CPU pair style. In this case, the -GPU pair style should be first in the hybrid command in order to -perform simultaneous calculations. This also -allows "bond"_bond_style.html, "angle"_angle_style.html, -"dihedral"_dihedral_style.html, "improper"_improper_style.html, -and "long-range"_kspace_style.html force -computations to be run simultaneously with the GPU pair style. -Once all CPU force computations have completed, the gpu fix -will block until the GPU has finished all work before continuing -the run. 
+When the CPU work in a GPU pair style has finished, the next force +computation will begin, possibly before the GPU has finished. If +{split} is 1.0 in the gpu fix, the next force computation will begin +almost immediately. This can be used to run a +"hybrid"_pair_hybrid.html GPU pair style at the same time as a hybrid +CPU pair style. In this case, the GPU pair style should be first in +the hybrid command in order to perform simultaneous calculations. This +also allows "bond"_bond_style.html, "angle"_angle_style.html, +"dihedral"_dihedral_style.html, "improper"_improper_style.html, and +"long-range"_kspace_style.html force computations to be run +simultaneously with the GPU pair style. Once all CPU force +computations have completed, the gpu fix will block until the GPU has +finished all work before continuing the run. GPU timing :h4 GPU accelerated pair styles can perform computations asynchronously -with CPU computations. The "Pair" time reported by LAMMPS -will be the maximum of the time required to complete the CPU -pair style computations and the time required to complete the GPU -pair style computations. Any time spent for GPU-enabled pair styles -for computations that run simultaneously with "bond"_bond_style.html, -"angle"_angle_style.html, "dihedral"_dihedral_style.html, -"improper"_improper_style.html, and "long-range"_kspace_style.html calculations -will not be included in the "Pair" time. +with CPU computations. The "Pair" time reported by LAMMPS will be the +maximum of the time required to complete the CPU pair style +computations and the time required to complete the GPU pair style +computations. Any time spent for GPU-enabled pair styles for +computations that run simultaneously with "bond"_bond_style.html, +"angle"_angle_style.html, "dihedral"_dihedral_style.html, +"improper"_improper_style.html, and "long-range"_kspace_style.html +calculations will not be included in the "Pair" time. 
-When {mode} for the gpu fix is force/neigh, -the time for neighbor list calculations on the GPU will be added -into the "Pair" time, not the "Neigh" time. A breakdown of the -times required for various tasks on the GPU (data copy, neighbor -calculations, force computations, etc.) are output only -with the LAMMPS screen output at the end of each run. These timings represent -total time spent on the GPU for each routine, regardless of asynchronous -CPU calculations. +When {mode} for the gpu fix is force/neigh, the time for neighbor list +calculations on the GPU will be added into the "Pair" time, not the +"Neigh" time. A breakdown of the times required for various tasks on +the GPU (data copy, neighbor calculations, force computations, etc.) +are output only with the LAMMPS screen output at the end of each +run. These timings represent total time spent on the GPU for each +routine, regardless of asynchronous CPU calculations. GPU single vs double precision :h4 -See the lammps/lib/gpu/README file for instructions on how to build -the LAMMPS gpu library for single, mixed, and double precision. The latter -requires that your GPU card supports double precision. +See the lammps/lib/gpu/README file for instructions on how to build +the LAMMPS gpu library for single, mixed, and double precision. The +latter requires that your GPU card supports double precision. :line diff --git a/doc/fix_gpu.html b/doc/fix_gpu.html index 72839bc0d1..f71a8e8a4a 100644 --- a/doc/fix_gpu.html +++ b/doc/fix_gpu.html @@ -48,14 +48,13 @@ should not be any problems with specifying this fix first in input scripts.

        mode specifies where neighbor list calculations will be performed. If mode is force, neighbor list calculation is performed on the CPU. If mode is force/neigh, neighbor list calculation is -performed on the GPU. GPU neighbor -list calculation currently cannot be used with a triclinic box. +performed on the GPU. GPU neighbor list calculation currently cannot be +used with a triclinic box. GPU neighbor list calculation currently +cannot be used with hybrid pair styles. GPU neighbor lists are not compatible with styles that are not GPU-enabled. When a non-GPU enabled style requires a neighbor list, it will also be built using CPU routines. In these cases, it will typically be more efficient -to only use CPU neighbor list builds. For hybrid pair -styles, GPU calculated neighbor lists might be less efficient because -no particles will be skipped in a given neighbor list. +to only use CPU neighbor list builds.

        first and last specify the GPUs that will be used for simulation. On each node, the GPU IDs in the inclusive range from first to last will @@ -77,7 +76,8 @@ style.

        In order to use GPU acceleration, a GPU enabled style must be selected in the input script in addition to this fix. Currently, -this is limited to a few pair styles. +this is limited to a few pair styles and +the PPPM kspace style.

        More details about these settings and various possible hardware configuration are in this section of the @@ -95,8 +95,10 @@ the run command.

        Restrictions:

        The fix must be the first fix specified for a given run. The force/neigh -mode should not be used with a triclinic box or GPU-enabled pair styles -that need special_bonds settings. +mode should not be used with a triclinic box or hybrid +pair styles. +

        +

        split must be positive when using hybrid pair styles.

        Currently, group-ID must be all.

        diff --git a/doc/fix_gpu.txt b/doc/fix_gpu.txt index 88fa6f5414..df8fbadb8f 100644 --- a/doc/fix_gpu.txt +++ b/doc/fix_gpu.txt @@ -39,14 +39,13 @@ should not be any problems with specifying this fix first in input scripts. {mode} specifies where neighbor list calculations will be performed. If {mode} is force, neighbor list calculation is performed on the CPU. If {mode} is force/neigh, neighbor list calculation is -performed on the GPU. GPU neighbor -list calculation currently cannot be used with a triclinic box. +performed on the GPU. GPU neighbor list calculation currently cannot be +used with a triclinic box. GPU neighbor list calculation currently +cannot be used with "hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not compatible with styles that are not GPU-enabled. When a non-GPU enabled style requires a neighbor list, it will also be built using CPU routines. In these cases, it will typically be more efficient -to only use CPU neighbor list builds. For "hybrid"_pair_hybrid.html pair -styles, GPU calculated neighbor lists might be less efficient because -no particles will be skipped in a given neighbor list. +to only use CPU neighbor list builds. {first} and {last} specify the GPUs that will be used for simulation. On each node, the GPU IDs in the inclusive range from {first} to {last} will @@ -68,7 +67,8 @@ style. In order to use GPU acceleration, a GPU enabled style must be selected in the input script in addition to this fix. Currently, -this is limited to a few "pair styles"_pair_style.html. +this is limited to a few "pair styles"_pair_style.html and +the PPPM "kspace style"_kspace_style.html. More details about these settings and various possible hardware configuration are in "this section"_Section_start.html#2_8 of the @@ -86,8 +86,10 @@ the "run"_run.html command. [Restrictions:] The fix must be the first fix specified for a given run. 
The force/neigh -{mode} should not be used with a triclinic box or GPU-enabled pair styles -that need "special_bonds"_special_bonds.html settings. +{mode} should not be used with a triclinic box or "hybrid"_pair_hybrid.html +pair styles. + +{split} must be positive when using "hybrid"_pair_hybrid.html pair styles. Currently, group-ID must be all. diff --git a/doc/kspace_style.html b/doc/kspace_style.html index 57c035f570..30b0bcbc1b 100644 --- a/doc/kspace_style.html +++ b/doc/kspace_style.html @@ -15,7 +15,7 @@

        kspace_style style value 
         
        -
        • style = none or ewald or pppm or pppm/tip4p or ewald/n +
          • style = none or ewald or pppm or pppm/tip4p or ewald/n or pppm/gpu/single or pppm/gpu/double
              none value = none
               ewald value = precision
            @@ -25,6 +25,10 @@
               pppm/tip4p value = precision
                 precision = desired accuracy
               ewald/n value = precision
            +    precision = desired accuracy
            +  pppm/gpu/single value = precision
            +    precision = desired accuracy
            +  pppm/gpu/double value = precision
                 precision = desired accuracy 
             
            @@ -72,6 +76,11 @@ long-range potentials.

            Currently, only the ewald/n style can be used with non-orthogonal (triclinic symmetry) simulation boxes.

            +

            The pppm/gpu/single and pppm/gpu/double styles are GPU-enabled +versions of pppm. See more details below. +

            +
            +

            When a kspace style is used, a pair style that includes the short-range correction to the pairwise Coulombic or other 1/r^N forces must also be selected. For Coulombic interactions, these styles are @@ -88,6 +97,27 @@ of K-space vectors for style ewald or the FFT grid size for style

            See the kspace_modify command for additional options of the K-space solvers that can be set.

            +
            + +

            The pppm/gpu/single style performs single precision +charge assignment and force interpolation calculations on the GPU. +The pppm/gpu/double style performs the mesh calculations on the GPU +in double precision. FFT solves are calculated on the CPU in both +cases. If either pppm/gpu/single or pppm/gpu/double are used with +a GPU-enabled pair style, part of the PPPM calculation can be performed +concurrently on the GPU while other calculations for non-bonded and +bonded force calculation are performed on the CPU. +

            +

            More details about GPU settings and various possible hardware +configurations are in this section of the +manual. +

            +

            Additional requirements in your input script to run with GPU-enabled +PPPM styles are as follows: +

            +

            fix gpu must be used. The fix controls +the essential GPU selection and initialization steps. +

            Restrictions:

            A simulation must be 3d and periodic in all dimensions to use an Ewald @@ -103,6 +133,11 @@ LAMMPS section for more info. enabled if LAMMPS was built with that package. See the Making LAMMPS section for more info.

            +

            The pppm/gpu/single and pppm/gpu/double styles are part of the +"gpu" package. They are only enabled if LAMMPS was built with that +package. See the Making LAMMPS section for +more info. +

            When using a long-range pairwise TIP4P potential, you must use kspace style pppm/tip4p and vice versa.

            diff --git a/doc/kspace_style.txt b/doc/kspace_style.txt index b6b12696d2..217978c193 100644 --- a/doc/kspace_style.txt +++ b/doc/kspace_style.txt @@ -12,7 +12,7 @@ kspace_style command :h3 kspace_style style value :pre -style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} :ulb,l +style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} or {pppm/gpu/single} or {pppm/gpu/double} :ulb,l {none} value = none {ewald} value = precision precision = desired accuracy @@ -21,6 +21,10 @@ style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} :ulb,l {pppm/tip4p} value = precision precision = desired accuracy {ewald/n} value = precision + precision = desired accuracy + {pppm/gpu/single} value = precision + precision = desired accuracy + {pppm/gpu/double} value = precision precision = desired accuracy :pre :ule @@ -67,6 +71,11 @@ long-range potentials. Currently, only the {ewald/n} style can be used with non-orthogonal (triclinic symmetry) simulation boxes. +The {pppm/gpu/single} and {pppm/gpu/double} styles are GPU-enabled +version of {pppm}. See more details below. + +:line + When a kspace style is used, a pair style that includes the short-range correction to the pairwise Coulombic or other 1/r^N forces must also be selected. For Coulombic interactions, these styles are @@ -83,6 +92,27 @@ of K-space vectors for style {ewald} or the FFT grid size for style See the "kspace_modify"_kspace_modify.html command for additional options of the K-space solvers that can be set. +:line + +The {pppm/gpu/single} style performs single precision +charge assignment and force interpolation calculations on the GPU. +The {pppm/gpu/double} style performs the mesh calculations on the GPU +in double precision. FFT solves are calculated on the CPU in both +cases. 
If either {pppm/gpu/single} or {pppm/gpu/double} are used with +a GPU-enabled pair style, part of the PPPM calculation can be performed +concurrently on the GPU while other calculations for non-bonded and +bonded force calculation are performed on the CPU. + +More details about GPU settings and various possible hardware +configurations are in "this section"_Section_start.html#2_8 of the +manual. + +Additional requirements in your input script to run with GPU-enabled +PPPM styles are as follows: + +"fix gpu"_fix_gpu.html must be used. The fix controls +the essential GPU selection and initialization steps. + [Restrictions:] A simulation must be 3d and periodic in all dimensions to use an Ewald @@ -98,6 +128,11 @@ The {ewald/n} style is part of the "user-ewaldn" package. It is only enabled if LAMMPS was built with that package. See the "Making LAMMPS"_Section_start.html#2_3 section for more info. +The {pppm/gpu/single} and {pppm/gpu/double} styles are part of the +"gpu" package. They are only enabled if LAMMPS was built with that +package. See the "Making LAMMPS"_Section_start.html#2_3 section for +more info. + When using a long-range pairwise TIP4P potential, you must use kspace style {pppm/tip4p} and vice versa. diff --git a/doc/pair_coeff.html b/doc/pair_coeff.html index fa98d3addd..0f54432555 100644 --- a/doc/pair_coeff.html +++ b/doc/pair_coeff.html @@ -134,6 +134,7 @@ the pair_style command, and coefficients specified by the associated
          • pair_style lj/cut/coul/long/gpu - GPU-enabled version of LJ with long-range Coulomb
          • pair_style lj/cut/coul/long/tip4p - LJ with long-range Coulomb for TIP4P water
          • pair_style lj/expand - Lennard-Jones for variable size particles +
          • pair_style lj/expand/gpu - GPU-enabled version of lj/expand
          • pair_style lj/gromacs - GROMACS-style Lennard-Jones potential
          • pair_style lj/gromacs/coul/gromacs - GROMACS-style LJ and Coulombic potential
          • pair_style lj/smooth - smoothed Lennard-Jones potential @@ -142,6 +143,7 @@ the pair_style command, and coefficients specified by the associated
          • pair_style lubricate - hydrodynamic lubrication forces
          • pair_style meam - modified embedded atom method (MEAM)
          • pair_style morse - Morse potential +
          • pair_style morse/gpu - GPU-enabled version of Morse potential
          • pair_style morse/opt - optimized version of Morse potential
          • pair_style peri/lps - peridynamic LPS potential
          • pair_style peri/pmb - peridynamic PMB potential diff --git a/doc/pair_coeff.txt b/doc/pair_coeff.txt index baf95341db..308e35329c 100644 --- a/doc/pair_coeff.txt +++ b/doc/pair_coeff.txt @@ -131,6 +131,7 @@ the pair_style command, and coefficients specified by the associated "pair_style lj/cut/coul/long/gpu"_pair_lj.html - GPU-enabled version of LJ with long-range Coulomb "pair_style lj/cut/coul/long/tip4p"_pair_lj.html - LJ with long-range Coulomb for TIP4P water "pair_style lj/expand"_pair_lj_expand.html - Lennard-Jones for variable size particles +"pair_style lj/expand/gpu"_pair_lj_expand.html - GPU-enabled version of lj/expand "pair_style lj/gromacs"_pair_gromacs.html - GROMACS-style Lennard-Jones potential "pair_style lj/gromacs/coul/gromacs"_pair_gromacs.html - GROMACS-style LJ and Coulombic potential "pair_style lj/smooth"_pair_lj_smooth.html - smoothed Lennard-Jones potential @@ -139,6 +140,7 @@ the pair_style command, and coefficients specified by the associated "pair_style lubricate"_pair_lubricate.html - hydrodynamic lubrication forces "pair_style meam"_pair_meam.html - modified embedded atom method (MEAM) "pair_style morse"_pair_morse.html - Morse potential +"pair_style morse/gpu"_pair_morse.html - GPU-enabled version of Morse potential "pair_style morse/opt"_pair_morse.html - optimized version of Morse potential "pair_style peri/lps"_pair_peri.html - peridynamic LPS potential "pair_style peri/pmb"_pair_peri.html - peridynamic PMB potential diff --git a/doc/pair_lj_expand.html b/doc/pair_lj_expand.html index 8dfb3d2068..9e766d3f4b 100644 --- a/doc/pair_lj_expand.html +++ b/doc/pair_lj_expand.html @@ -11,10 +11,14 @@

            pair_style lj/expand command

            +

            pair_style lj/expand/gpu command +

            Syntax:

            pair_style lj/expand cutoff 
             
            +
            pair_style lj/expand/gpu cutoff 
            +
            • cutoff = global cutoff for lj/expand interactions (distance units)

            Examples: @@ -49,6 +53,29 @@ commands, or by mixing as described below:

            The delta values can be positive or negative. The last coefficient is optional. If not specified, the global LJ cutoff is used.

            +

            Style lj/expand/gpu is a GPU-enabled version of style lj/expand. +See more details below. +

            +
            + +

            The lj/expand/gpu style is identical to the lj/expand style, +except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the Running on GPUs section of +the manual for more details about hardware and software requirements +for using GPUs. +

            +

            More details about these settings and various possible hardware +configuration are in this section of the +manual. +

            +

            Additional requirements in your input script to run with GPU-enabled styles +are as follows: +

            +

            The newton pair setting must be off and +fix gpu must be used. The fix controls +the essential GPU selection and initialization steps. +


            Mixing, shift, table, tail correction, restart, rRESPA info: @@ -80,7 +107,11 @@ to be specified in an input script that reads a restart file.


            -

            Restrictions: none +

            Restrictions: +

            +

            The lj/expand/gpu style is part of the "gpu" package. It is only +enabled if LAMMPS was built with that package. See the Making +LAMMPS section for more info.

            Related commands:

            diff --git a/doc/pair_lj_expand.txt b/doc/pair_lj_expand.txt index 3c82f5b944..96487df87e 100644 --- a/doc/pair_lj_expand.txt +++ b/doc/pair_lj_expand.txt @@ -7,10 +7,12 @@ :line pair_style lj/expand command :h3 +pair_style lj/expand/gpu command :h3 [Syntax:] pair_style lj/expand cutoff :pre +pair_style lj/expand/gpu cutoff :pre cutoff = global cutoff for lj/expand interactions (distance units) :ul @@ -46,6 +48,29 @@ cutoff (distance units) :ul The delta values can be positive or negative. The last coefficient is optional. If not specified, the global LJ cutoff is used. +Style {lj/expand/gpu} is a GPU-enabled version of style {lj/expand}. +See more details below. + +:line + +The {lj/expand/gpu} style is identical to the {lj/expand} style, +except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of +the manual for more details about hardware and software requirements +for using GPUs. + +More details about these settings and various possible hardware +configuration are in "this section"_Section_start.html#2_8 of the +manual. + +Additional requirements in your input script to run with GPU-enabled styles +are as follows: + +The "newton pair"_newton.html setting must be {off} and +"fix gpu"_fix_gpu.html must be used. The fix controls +the essential GPU selection and initialization steps. + :line [Mixing, shift, table, tail correction, restart, rRESPA info]: @@ -77,7 +102,11 @@ This pair style can only be used via the {pair} keyword of the :line -[Restrictions:] none +[Restrictions:] + +The {lj/expand/gpu} style is part of the "gpu" package. It is only +enabled if LAMMPS was built with that package. See the "Making +LAMMPS"_Section_start.html#2_3 section for more info. 
[Related commands:] diff --git a/doc/pair_morse.html b/doc/pair_morse.html index e5183ef53e..0f505c5d28 100644 --- a/doc/pair_morse.html +++ b/doc/pair_morse.html @@ -11,12 +11,18 @@

            pair_style morse command

            +

            pair_style morse/gpu command +

            pair_style morse/opt command

            Syntax:

            pair_style morse cutoff 
             
            +
            pair_style morse/gpu cutoff 
            +
            +
            pair_style morse/opt cutoff 
            +
            • cutoff = global cutoff for Morse interactions (distance units)

            Examples: @@ -53,6 +59,29 @@ give identical answers. Depending on system size and the processor you are running on, it may be 5-25% faster (for the pairwise portion of the run time).

            +

            Style morse/gpu is a GPU-enabled version of style morse. +See more details below. +

            +
            + +

            The morse/gpu style is identical to the morse style, +except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the Running on GPUs section of +the manual for more details about hardware and software requirements +for using GPUs. +

            +

            More details about these settings and various possible hardware +configuration are in this section of the +manual. +

            +

            Additional requirements in your input script to run with GPU-enabled styles +are as follows: +

            +

            The newton pair setting must be off and +fix gpu must be used. The fix controls +the essential GPU selection and initialization steps. +


            Mixing, shift, table, tail correction, restart, rRESPA info: @@ -82,8 +111,9 @@ to be specified in an input script that reads a restart file.

            Restrictions:

            -

            The morse/opt style is part of the "opt" package. It is only -enabled if LAMMPS was built with that package. See the Making +

            The morse/opt style is part of the "opt" package. The morse/gpu +style is part of the "gpu" package. They are only +enabled if LAMMPS was built with those packages. See the Making LAMMPS section for more info.

            Related commands: diff --git a/doc/pair_morse.txt b/doc/pair_morse.txt index 1c1799c242..8e23d84767 100644 --- a/doc/pair_morse.txt +++ b/doc/pair_morse.txt @@ -7,11 +7,14 @@ :line pair_style morse command :h3 +pair_style morse/gpu command :h3 pair_style morse/opt command :h3 [Syntax:] pair_style morse cutoff :pre +pair_style morse/gpu cutoff :pre +pair_style morse/opt cutoff :pre cutoff = global cutoff for Morse interactions (distance units) :ul @@ -49,6 +52,29 @@ give identical answers. Depending on system size and the processor you are running on, it may be 5-25% faster (for the pairwise portion of the run time). +Style {morse/gpu} is a GPU-enabled version of style {morse}. +See more details below. + +:line + +The {morse/gpu} style is identical to the {morse} style, +except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of +the manual for more details about hardware and software requirements +for using GPUs. + +More details about these settings and various possible hardware +configuration are in "this section"_Section_start.html#2_8 of the +manual. + +Additional requirements in your input script to run with GPU-enabled styles +are as follows: + +The "newton pair"_newton.html setting must be {off} and +"fix gpu"_fix_gpu.html must be used. The fix controls +the essential GPU selection and initialization steps. + :line [Mixing, shift, table, tail correction, restart, rRESPA info]: @@ -78,8 +104,9 @@ These pair styles can only be used via the {pair} keyword of the [Restrictions:] -The {morse/opt} style is part of the "opt" package. It is only -enabled if LAMMPS was built with that package. See the "Making +The {morse/opt} style is part of the "opt" package. The {morse/gpu} +style is part of the "gpu" package. They are only +enabled if LAMMPS was built with those packages. 
See the "Making LAMMPS"_Section_start.html#2_3 section for more info. [Related commands:] diff --git a/doc/pair_style.html b/doc/pair_style.html index 450428a7bc..862a22d7cc 100644 --- a/doc/pair_style.html +++ b/doc/pair_style.html @@ -136,6 +136,7 @@ the pair_style command, and coefficients specified by the associated

          • pair_style lj/cut/coul/long/gpu - GPU-enabled version of LJ with long-range Coulomb
          • pair_style lj/cut/coul/long/tip4p - LJ with long-range Coulomb for TIP4P water
          • pair_style lj/expand - Lennard-Jones for variable size particles +
          • pair_style lj/expand/gpu - GPU-enabled version of lj/expand
          • pair_style lj/gromacs - GROMACS-style Lennard-Jones potential
          • pair_style lj/gromacs/coul/gromacs - GROMACS-style LJ and Coulombic potential
          • pair_style lj/smooth - smoothed Lennard-Jones potential @@ -144,6 +145,7 @@ the pair_style command, and coefficients specified by the associated
          • pair_style lubricate - hydrodynamic lubrication forces
          • pair_style meam - modified embedded atom method (MEAM)
          • pair_style morse - Morse potential +
          • pair_style morse/gpu - GPU-enabled version of Morse potential
          • pair_style morse/opt - optimized version of Morse potential
          • pair_style peri/lps - peridynamic LPS potential
          • pair_style peri/pmb - peridynamic PMB potential diff --git a/doc/pair_style.txt b/doc/pair_style.txt index 0db8457ea5..1943b32c99 100644 --- a/doc/pair_style.txt +++ b/doc/pair_style.txt @@ -133,6 +133,7 @@ the pair_style command, and coefficients specified by the associated "pair_style lj/cut/coul/long/gpu"_pair_lj.html - GPU-enabled version of LJ with long-range Coulomb "pair_style lj/cut/coul/long/tip4p"_pair_lj.html - LJ with long-range Coulomb for TIP4P water "pair_style lj/expand"_pair_lj_expand.html - Lennard-Jones for variable size particles +"pair_style lj/expand/gpu"_pair_lj_expand.html - GPU-enabled version of lj/expand "pair_style lj/gromacs"_pair_gromacs.html - GROMACS-style Lennard-Jones potential "pair_style lj/gromacs/coul/gromacs"_pair_gromacs.html - GROMACS-style LJ and Coulombic potential "pair_style lj/smooth"_pair_lj_smooth.html - smoothed Lennard-Jones potential @@ -141,6 +142,7 @@ the pair_style command, and coefficients specified by the associated "pair_style lubricate"_pair_lubricate.html - hydrodynamic lubrication forces "pair_style meam"_pair_meam.html - modified embedded atom method (MEAM) "pair_style morse"_pair_morse.html - Morse potential +"pair_style morse/gpu"_pair_morse.html - GPU-enabled version of Morse potential "pair_style morse/opt"_pair_morse.html - optimized version of Morse potential "pair_style peri/lps"_pair_peri.html - peridynamic LPS potential "pair_style peri/pmb"_pair_peri.html - peridynamic PMB potential From 2be078632da846d4e0893bcebe5ee4d03d4da490 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:02:09 +0000 Subject: [PATCH 16/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6052 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- tools/restart2data.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/restart2data.cpp b/tools/restart2data.cpp index ba346a5087..70500c7d02 100644 --- a/tools/restart2data.cpp +++ b/tools/restart2data.cpp @@ -1860,7 +1860,8 @@ void 
pair(FILE *fp, Data &data, char *style, int flag) } } - } else if (strcmp(style,"lj/expand") == 0) { + } else if ((strcmp(style,"lj/expand") == 0) || + (strcmp(style,"lj/expand/gpu") == 0)) { double cut_global = read_double(fp); int offset_flag = read_int(fp); @@ -1981,6 +1982,7 @@ void pair(FILE *fp, Data &data, char *style, int flag) } else if (strcmp(style,"meam") == 0) { } else if ((strcmp(style,"morse") == 0) || + (strcmp(style,"morse/gpu") == 0) || (strcmp(style,"morse/opt") == 0)) { double cut_global = read_double(fp); @@ -2837,7 +2839,8 @@ void Data::write(FILE *fp, FILE *fp2) fprintf(fp,"%d %g %g\n",i, pair_lj_epsilon[i],pair_lj_sigma[i]); - } else if (strcmp(pair_style,"lj/expand") == 0) { + } else if ((strcmp(pair_style,"lj/expand") == 0) || + (strcmp(pair_style,"lj/expand/gpu")==0)) { for (int i = 1; i <= ntypes; i++) fprintf(fp,"%d %g %g %g\n",i, pair_ljexpand_epsilon[i],pair_ljexpand_sigma[i], @@ -2855,6 +2858,7 @@ void Data::write(FILE *fp, FILE *fp2) pair_ljsmooth_epsilon[i],pair_ljsmooth_sigma[i]); } else if ((strcmp(pair_style,"morse") == 0) || + (strcmp(pair_style,"morse/gpu") == 0) || (strcmp(pair_style,"morse/opt") == 0)) { for (int i = 1; i <= ntypes; i++) fprintf(fp,"%d %g %g %g\n",i, From 5f799182b3822786373f4e10b43a405711bb27d2 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:02:52 +0000 Subject: [PATCH 17/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile.fermi | 2 +- lib/gpu/Makefile.lens | 6 +- lib/gpu/Makefile.lincoln | 2 +- lib/gpu/Makefile.linux | 2 +- lib/gpu/Makefile.linux_opencl | 2 +- lib/gpu/Makefile.longhorn | 2 +- lib/gpu/Makefile.mac | 2 +- lib/gpu/Makefile.mac_opencl | 2 +- lib/gpu/Nvidia.makefile | 103 +++++- lib/gpu/Opencl.makefile | 84 +++-- lib/gpu/README | 1 + lib/gpu/atomic_gpu_memory.cpp | 133 ++++--- lib/gpu/atomic_gpu_memory.h | 66 ++-- lib/gpu/charge_gpu_memory.cpp | 140 +++++--- lib/gpu/charge_gpu_memory.h | 72 ++-- 
lib/gpu/cmm_cut_gpu.cpp | 68 ++-- lib/gpu/cmm_cut_gpu_kernel.cu | 199 ++++++++--- lib/gpu/cmm_cut_gpu_memory.cpp | 49 +-- lib/gpu/cmm_cut_gpu_memory.h | 21 +- lib/gpu/cmmc_long_gpu.cpp | 82 ++--- lib/gpu/cmmc_long_gpu_kernel.cu | 208 ++++++++--- lib/gpu/cmmc_long_gpu_memory.cpp | 57 +-- lib/gpu/cmmc_long_gpu_memory.h | 25 +- lib/gpu/crml_gpu.cpp | 95 ++--- lib/gpu/crml_gpu_kernel.cu | 212 ++++++++--- lib/gpu/crml_gpu_memory.cpp | 42 ++- lib/gpu/crml_gpu_memory.h | 29 +- lib/gpu/gb_gpu.cpp | 203 ++++++----- lib/gpu/gb_gpu_extra.h | 5 +- lib/gpu/gb_gpu_kernel.cu | 535 ++++++++++++++------------- lib/gpu/gb_gpu_kernel_lj.cu | 261 ++++++++++---- lib/gpu/gb_gpu_kernel_nbor.cu | 5 +- lib/gpu/gb_gpu_memory.cpp | 94 +++-- lib/gpu/gb_gpu_memory.h | 75 ++-- lib/gpu/geryon/VERSION.txt | 4 +- lib/gpu/geryon/nvc_device.h | 4 +- lib/gpu/geryon/nvd_device.h | 16 +- lib/gpu/geryon/nvd_timer.h | 12 + lib/gpu/geryon/ocl_timer.h | 12 + lib/gpu/geryon/ucl_arg_kludge.h | 597 ++++++++++++++++++++++++++++++- lib/gpu/geryon/ucl_d_mat.h | 40 ++- lib/gpu/geryon/ucl_d_vec.h | 35 +- lib/gpu/geryon/ucl_h_mat.h | 44 ++- lib/gpu/geryon/ucl_h_vec.h | 40 ++- lib/gpu/geryon/ucl_nv_kernel.h | 19 +- lib/gpu/lj96_cut_gpu.cpp | 68 ++-- lib/gpu/lj96_cut_gpu_kernel.cu | 197 +++++++--- lib/gpu/lj96_cut_gpu_memory.cpp | 35 +- lib/gpu/lj96_cut_gpu_memory.h | 21 +- lib/gpu/lj_cut_gpu.cpp | 67 ++-- lib/gpu/lj_cut_gpu_kernel.cu | 197 +++++++--- lib/gpu/lj_cut_gpu_memory.cpp | 49 +-- lib/gpu/lj_cut_gpu_memory.h | 21 +- lib/gpu/ljc_cut_gpu.cpp | 82 ++--- lib/gpu/ljc_cut_gpu_kernel.cu | 209 ++++++++--- lib/gpu/ljc_cut_gpu_memory.cpp | 52 +-- lib/gpu/ljc_cut_gpu_memory.h | 25 +- lib/gpu/ljcl_cut_gpu.cpp | 82 ++--- lib/gpu/ljcl_cut_gpu_kernel.cu | 208 ++++++++--- lib/gpu/ljcl_cut_gpu_memory.cpp | 35 +- lib/gpu/ljcl_cut_gpu_memory.h | 25 +- lib/gpu/pair_gpu_atom.cpp | 360 +++---------------- lib/gpu/pair_gpu_atom.h | 270 +++++++------- lib/gpu/pair_gpu_balance.h | 87 ++--- lib/gpu/pair_gpu_build_kernel.cu | 77 ++-- 
lib/gpu/pair_gpu_device.cpp | 448 ++++++++++++++++++++--- lib/gpu/pair_gpu_device.h | 181 +++++++++- lib/gpu/pair_gpu_nbor.cpp | 196 +++++----- lib/gpu/pair_gpu_nbor.h | 41 +-- lib/gpu/pair_gpu_precision.h | 2 - 70 files changed, 4489 insertions(+), 2253 deletions(-) diff --git a/lib/gpu/Makefile.fermi b/lib/gpu/Makefile.fermi index d830c8924c..98c823cf40 100644 --- a/lib/gpu/Makefile.fermi +++ b/lib/gpu/Makefile.fermi @@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include +CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON BIN_DIR = ./ diff --git a/lib/gpu/Makefile.lens b/lib/gpu/Makefile.lens index 3b6301277f..d049967c5f 100644 --- a/lib/gpu/Makefile.lens +++ b/lib/gpu/Makefile.lens @@ -17,16 +17,16 @@ # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ -CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/ +CUDA_HOME = /sw/analysis-x64/cuda/3.2/centos5.5_binary/ NVCC = nvcc CUDA_ARCH = -arch=sm_13 -CUDA_PRECISION = -D_SINGLE_SINGLE +CUDA_PRECISION = -D_SINGLE_DOUBLE CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON -openmp +CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias BIN_DIR = ./ diff --git a/lib/gpu/Makefile.lincoln b/lib/gpu/Makefile.lincoln index 97a7901811..bbaca61ef1 100644 --- a/lib/gpu/Makefile.lincoln +++ b/lib/gpu/Makefile.lincoln @@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON +CUDR_CPP = mpic++ -DMPI_GERYON 
-DUCL_NO_EXIT CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops BIN_DIR = ./ diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux index c0001a54ab..d69a00a817 100644 --- a/lib/gpu/Makefile.linux +++ b/lib/gpu/Makefile.linux @@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias BIN_DIR = ./ diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 69522298c5..3d65c9dc48 100644 --- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -17,7 +17,7 @@ # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ -OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_SINGLE diff --git a/lib/gpu/Makefile.longhorn b/lib/gpu/Makefile.longhorn index ba921f0f68..cc41174332 100644 --- a/lib/gpu/Makefile.longhorn +++ b/lib/gpu/Makefile.longhorn @@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB) CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias BIN_DIR = ./ diff --git a/lib/gpu/Makefile.mac b/lib/gpu/Makefile.mac index f061a1a68a..5276ac10b2 100644 --- a/lib/gpu/Makefile.mac +++ b/lib/gpu/Makefile.mac @@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11 CUDA_PRECISION = -D_SINGLE_SINGLE CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib -CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32 +CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v 
--use_fast_math -m32 CUDR_CPP = mpic++ CUDR_OPTS = -O2 -m32 -g diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index 53d6d466e2..50ed67e9c3 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -17,7 +17,7 @@ # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ -OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON +OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT OCL_LINK = -framework OpenCL OCL_PREC = -D_SINGLE_SINGLE diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index adf281e156..17f616ab37 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -13,7 +13,8 @@ # # /* ---------------------------------------------------------------------- # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov -# Peng Wang (Nvidia), penwang@nvidia.com +# Peng Wang (Nvidia), penwang@nvidia.com +# Inderaj Bains (NVIDIA), ibains@nvidia.com # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ @@ -28,10 +29,11 @@ GPU_LIB = $(LIB_DIR)/libgpu.a # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H) -NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) +NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) nv_kernel_def.h # Headers for Pair Stuff -PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \ - pair_gpu_device.h pair_gpu_balance.h +PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \ + pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \ + pair_gpu_balance.h pppm_gpu_memory.h ALL_H = $(NVD_H) $(PAIR_H) @@ -39,28 +41,37 @@ EXECS = $(BIN_DIR)/nvc_get_devices CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \ $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \ $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o -OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ 
- $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \ - $(OBJ_DIR)/charge_gpu_memory.o \ +OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \ + $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \ + $(OBJ_DIR)/pair_gpu_device.o \ + $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \ + $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \ $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \ $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \ $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ + $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \ $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \ $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \ $(CUDPP) -PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ +PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \ + $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \ $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \ + $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \ + $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \ $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \ $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \ $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \ $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \ + $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \ $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \ $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \ - $(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \ + $(OBJ_DIR)/morse_gpu_kernel.ptx 
$(OBJ_DIR)/morse_gpu_ptx.h \ + $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \ $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \ $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h @@ -93,6 +104,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h $(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H) + $(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu $(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu @@ -105,11 +119,20 @@ $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h -$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H) +$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H) + $(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H) $(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H) - $(CUDR) -o $@ -c pair_gpu_device.cpp +$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu + $(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu + +$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h + +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h + $(CUDR) -o $@ -c pair_gpu_device.cpp 
-I$(OBJ_DIR) $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp $(CUDR) -o $@ -c atomic_gpu_memory.cpp @@ -117,6 +140,24 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp $(CUDR) -o $@ -c charge_gpu_memory.cpp +$(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu + +$(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h + +$(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu + +$(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h + +$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h + $(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp + $(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h $(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu @@ -144,7 +185,7 @@ $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_ $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o $(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp +$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h $(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu 
pair_gpu_precision.h @@ -156,7 +197,7 @@ $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_c $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o $(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp +$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h @@ -168,9 +209,21 @@ $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o $(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp +$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu + +$(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h + +$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o + $(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h + $(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h $(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu @@ -180,7 +233,7 @@ $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx 
$(OBJ_DIR)/crml_gpu_ke $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o $(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp +$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h @@ -192,9 +245,21 @@ $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj9 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o $(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp +$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h $(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu + +$(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h + +$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o + $(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h + $(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h $(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu @@ -204,7 +269,7 @@ $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_c $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) 
cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o $(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp +$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h $(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h @@ -216,7 +281,7 @@ $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/c $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o $(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp +$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR) $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H) diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index ac7aecc2ee..45e21736a3 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -14,6 +14,7 @@ # /* ---------------------------------------------------------------------- # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov # Peng Wang (Nvidia), penwang@nvidia.com +# Inderaj Bains (NVIDIA), ibains@nvidia.com # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ @@ -23,30 +24,37 @@ OCL_LIB = $(LIB_DIR)/libgpu.a UCL_H = $(wildcard ./geryon/ucl*.h) OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) # Headers for Pair Stuff -PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \ - pair_gpu_device.h pair_gpu_balance.h +PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \ + pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \ + pair_gpu_balance.h pppm_gpu_memory.h 
ALL_H = $(OCL_H) $(PAIR_H) EXECS = $(BIN_DIR)/ocl_get_devices -OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ - $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \ - $(OBJ_DIR)/charge_gpu_memory.o \ +OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \ + $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \ + $(OBJ_DIR)/pair_gpu_device.o \ + $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \ + $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \ $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \ $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \ $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ + $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \ $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \ $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o -KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \ +KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \ + $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \ $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \ $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \ - $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \ - $(OBJ_DIR)/crml_gpu_cl.h \ - $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h - + $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \ + $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \ + $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \ + $(OBJ_DIR)/cmmc_long_gpu_cl.h + OCL_EXECS = $(BIN_DIR)/ocl_get_devices all: $(OCL_LIB) $(EXECS) @@ -57,14 +65,23 @@ $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) 
$(OBJ_DIR)/pair_gpu_atom_cl.h $(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H) + $(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu $(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h -$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h +$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h + $(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h $(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H) - $(OCL) -o $@ -c pair_gpu_device.cpp +$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h + +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h + $(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR) $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp $(OCL) -o $@ -c atomic_gpu_memory.cpp @@ -72,6 +89,15 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp $(OCL) -o $@ -c charge_gpu_memory.cpp +$(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h; + +$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h + $(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp + $(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu 
$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h @@ -93,7 +119,7 @@ $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o $(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp +$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h $(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu @@ -102,7 +128,7 @@ $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o $(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp +$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h $(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu @@ -111,16 +137,25 @@ $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o $(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp +$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h $(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h; + +$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h 
morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o + $(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h + $(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu $(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h; $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o $(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp +$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h $(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu @@ -129,16 +164,25 @@ $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o $(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp +$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h $(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h; + +$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o + $(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h + 
$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu $(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h; $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o $(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp +$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h $(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu @@ -147,7 +191,7 @@ $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o $(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp +$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h $(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR) $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp diff --git a/lib/gpu/README b/lib/gpu/README index 567d81886b..a60d43064a 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -14,6 +14,7 @@ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov Peng Wang (Nvidia), penwang@nvidia.com + Inderaj Bains (NVIDIA), ibains@nvidia.com Paul Crozier (SNL), pscrozi@sandia.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp index e1cc48048b..531ea4000d 100644 --- a/lib/gpu/atomic_gpu_memory.cpp +++ b/lib/gpu/atomic_gpu_memory.cpp @@ -23,23 +23,28 
@@ extern PairGPUDevice pair_gpu_device; template AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) { device=&pair_gpu_device; + ans=new PairGPUAns(); + nbor=new PairGPUNbor(); } template AtomicGPUMemoryT::~AtomicGPUMemory() { + delete ans; + delete nbor; } template int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const { - return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); } template -bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, - const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const char *pair_program) { +int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { nbor_time_avail=false; screen=_screen; @@ -48,24 +53,30 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, gpu_nbor=true; int _gpu_host=0; - int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; - if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor, - _gpu_host,max_nbors,cell_size,false)) - return false; + _threads_per_atom=device->threads_per_atom(); + if (_threads_per_atom>1 && gpu_nbor==false) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,false); + if (success!=0) + return success; + ucl_device=device->gpu; atom=&device->atom; - nbor=&device->nbor; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + 
_block_size=device->pair_block_size(); compile_kernels(*ucl_device,pair_program); // Initialize host-device load balancer - hd_balancer.init(device,gpu_split); + hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_pair.init(*ucl_device); @@ -73,9 +84,14 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, pos_tex.bind_float(atom->dev_x,4); - _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); - return true; + return 0; +} + +template +void AtomicGPUMemoryT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); } template @@ -83,7 +99,10 @@ void AtomicGPUMemoryT::clear_atomic() { // Output any timing information acc_timers(); double avg_split=hd_balancer.all_avg_split(); - device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); if (_compiled) { k_pair_fast.clear(); @@ -107,8 +126,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, success=true; nbor_time_avail=true; - - int mn=nbor->max_nbor_loop(inum,numj); + int mn=nbor->max_nbor_loop(inum,numj,ilist); resize_atom(inum,nall,success); resize_local(inum,mn,success); if (!success) @@ -116,7 +134,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, nbor->get_host(inum,ilist,numj,firstneigh,block_size()); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -130,8 +148,8 @@ template inline void AtomicGPUMemoryT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, - int *host_type, double *boxlo, - double *boxhi, int *tag, + int *host_type, 
double *sublo, + double *subhi, int *tag, int **nspecial, int **special, bool &success) { nbor_time_avail=true; @@ -144,10 +162,10 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag, + nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } @@ -156,24 +174,25 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void AtomicGPUMemoryT::compute(const int timestep, const int f_ago, - const int inum_full, const int nall, - double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success) { +void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success) { acc_timers(); if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); zero_timers(); return; } int ago=hd_balancer.ago_first(f_ago); - int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time, - nbor->gpu_nbor()); - atom->inum(inum); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); host_start=inum; if (ago==0) { @@ -187,7 +206,8 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago, atom->add_x_data(host_x,host_type); 
loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -195,29 +215,32 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int * AtomicGPUMemoryT::compute(const int timestep, const int ago, - const int inum_full, const int nall, - double **host_x, int *host_type, double *boxlo, - double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { +int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success) { acc_timers(); if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); zero_timers(); return NULL; } - hd_balancer.balance(cpu_time,nbor->gpu_nbor()); - int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full); - atom->inum(inum); + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - boxlo, boxhi, tag, nspecial, special, success); + sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; hd_balancer.start_timer(); @@ -226,19 +249,21 @@ int * AtomicGPUMemoryT::compute(const int timestep, const int ago, hd_balancer.start_timer(); 
atom->add_x_data(host_x,host_type); } + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); hd_balancer.stop_timer(); - return device->nbor.host_nbor.begin(); + return nbor->host_jlist.begin()-host_start; } template double AtomicGPUMemoryT::host_memory_usage_atomic() const { - return device->atom.host_memory_usage()+ - device->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(AtomicGPUMemory); + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(AtomicGPUMemory); } template diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h index 81de41f3b7..238a4d9c1e 100644 --- a/lib/gpu/atomic_gpu_memory.h +++ b/lib/gpu/atomic_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef ATOMIC_GPU_MEMORY_H #define ATOMIC_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" @@ -39,17 +37,28 @@ class AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, - const char *pair_program); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const char *pair_program); + + /// Estimate the overhead for GPU 
context changes and CPU driver + void estimate_gpu_overhead(); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - if (atom->resize(inum, nall, success)) + if (atom->resize(nall, success)) pos_tex.bind_float(atom->dev_x,4); + ans->resize(inum,success); } /// Check if there is enough storage for neighbors and realloc if not @@ -85,13 +94,16 @@ class AtomicGPUMemory { /// Accumulate timers inline void acc_timers() { - if (nbor_time_avail) { - nbor->time_nbor.add_to_total(); - nbor->time_kernel.add_to_total(); - nbor_time_avail=false; + if (device->time_device()) { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); } - time_pair.add_to_total(); - atom->acc_timers(); } /// Zero timers @@ -99,6 +111,7 @@ class AtomicGPUMemory { nbor_time_avail=false; time_pair.zero(); atom->zero_timers(); + ans->zero_timers(); } /// Copy neighbor list from host @@ -108,24 +121,32 @@ class AtomicGPUMemory { /// Build neighbor list on device void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success); /// Pair loop with host neighboring - void compute(const int timestep, const int f_ago, const int inum_full, + void compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); /// Pair loop with device neighboring - int * compute(const int timestep, const int ago, const int inum_full, - const int nall, double 
**host_x, int *host_type, double *boxlo, - double *boxhi, int *tag, int **nspecial, + int * compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); + /// Pair loop with device neighboring + int ** compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success); + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage @@ -148,6 +169,9 @@ class AtomicGPUMemory { /// Atom Data PairGPUAtom *atom; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; // --------------------------- NBOR DATA ---------------------------- @@ -167,8 +191,10 @@ class AtomicGPUMemory { protected: bool _compiled; - int _block_size; + int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const char *pair_string); diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp index ce43fdfda1..412596f5f2 100644 --- a/lib/gpu/charge_gpu_memory.cpp +++ b/lib/gpu/charge_gpu_memory.cpp @@ -23,23 +23,28 @@ extern PairGPUDevice pair_gpu_device; template ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) { device=&pair_gpu_device; + ans=new PairGPUAns(); + nbor=new PairGPUNbor(); } template ChargeGPUMemoryT::~ChargeGPUMemory() { + delete ans; + delete nbor; } template int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const { - return 
device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); } template -bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, - const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const char *pair_program) { +int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { nbor_time_avail=false; screen=_screen; @@ -48,24 +53,31 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, gpu_nbor=true; int _gpu_host=0; - int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; - if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor, - _gpu_host,max_nbors,cell_size,false)) - return false; + _threads_per_atom=device->threads_per_charge(); + if (_threads_per_atom>1 && gpu_nbor==false) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,false); + if (success!=0) + return success; + ucl_device=device->gpu; atom=&device->atom; - nbor=&device->nbor; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program); // Initialize host-device load balancer - hd_balancer.init(device,gpu_split); + hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_pair.init(*ucl_device); @@ -74,9 +86,14 @@ bool 
ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, pos_tex.bind_float(atom->dev_x,4); q_tex.bind_float(atom->dev_q,1); - _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); - return true; + return success; +} + +template +void ChargeGPUMemoryT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); } template @@ -84,7 +101,10 @@ void ChargeGPUMemoryT::clear_atomic() { // Output any timing information acc_timers(); double avg_split=hd_balancer.all_avg_split(); - device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); if (_compiled) { k_pair_fast.clear(); @@ -109,7 +129,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, nbor_time_avail=true; - int mn=nbor->max_nbor_loop(inum,numj); + int mn=nbor->max_nbor_loop(inum,numj,ilist); resize_atom(inum,nall,success); resize_local(inum,mn,success); if (!success) @@ -117,7 +137,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, nbor->get_host(inum,ilist,numj,firstneigh,block_size()); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -131,8 +151,8 @@ template inline void ChargeGPUMemoryT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, - int *host_type, double *boxlo, - double *boxhi, int *tag, + int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, bool &success) { nbor_time_avail=true; @@ -145,10 +165,10 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(inum, 
host_inum, nall, *atom, boxlo, boxhi, tag, + nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } @@ -157,24 +177,26 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void ChargeGPUMemoryT::compute(const int timestep, const int f_ago, - const int inum_full, const int nall, - double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q) { +void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { acc_timers(); if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); zero_timers(); return; } int ago=hd_balancer.ago_first(f_ago); - int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time, - nbor->gpu_nbor()); - atom->inum(inum); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); host_start=inum; if (ago==0) { @@ -187,10 +209,14 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago, atom->cast_q_data(host_q); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - atom->add_other_data(); + atom->add_q_data(); + + device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, + boxlo, 
prd); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -198,30 +224,33 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int * ChargeGPUMemoryT::compute(const int timestep, const int ago, - const int inum_full, const int nall, - double **host_x, int *host_type, double *boxlo, - double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, +int** ChargeGPUMemoryT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, - double *host_q) { + double *host_q, double *boxlo, double *prd) { acc_timers(); if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); zero_timers(); return NULL; } - hd_balancer.balance(cpu_time,nbor->gpu_nbor()); - int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full); - atom->inum(inum); + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - boxlo, boxhi, tag, nspecial, special, success); + sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; atom->cast_q_data(host_q); @@ -232,20 +261,25 @@ int * ChargeGPUMemoryT::compute(const int timestep, const int ago, hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } - atom->add_other_data(); + 
atom->add_q_data(); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, + boxlo, prd); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); hd_balancer.stop_timer(); - return device->nbor.host_nbor.begin(); + return nbor->host_jlist.begin()-host_start; } template double ChargeGPUMemoryT::host_memory_usage_atomic() const { - return device->atom.host_memory_usage()+ - device->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(ChargeGPUMemory); + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(ChargeGPUMemory); } template diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h index d18857e4d6..768f0e0c08 100644 --- a/lib/gpu/charge_gpu_memory.h +++ b/lib/gpu/charge_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef CHARGE_GPU_MEMORY_H #define CHARGE_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" @@ -39,19 +37,30 @@ class ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, - const char *pair_program); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE 
*screen, + const char *pair_program); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - if (atom->resize(inum, nall, success)) { + if (atom->resize(nall, success)) { pos_tex.bind_float(atom->dev_x,4); q_tex.bind_float(atom->dev_q,1); } + ans->resize(inum,success); } /// Check if there is enough storage for neighbors and realloc if not @@ -87,13 +96,16 @@ class ChargeGPUMemory { /// Accumulate timers inline void acc_timers() { - if (nbor_time_avail) { - nbor->time_nbor.add_to_total(); - nbor->time_kernel.add_to_total(); - nbor_time_avail=false; + if (device->time_device()) { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); } - time_pair.add_to_total(); - atom->acc_timers(); } /// Zero timers @@ -101,6 +113,7 @@ class ChargeGPUMemory { nbor_time_avail=false; time_pair.zero(); atom->zero_timers(); + ans->zero_timers(); } /// Copy neighbor list from host @@ -110,24 +123,25 @@ class ChargeGPUMemory { /// Build neighbor list on device void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success); /// Pair loop with host neighboring - void compute(const int timestep, const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success, - double *charge); + void compute(const int f_ago, const int 
inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd); /// Pair loop with device neighboring - int * compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *boxlo, - double *boxhi, int *tag, int **nspecial, + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *charge); + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); // -------------------------- DEVICE DATA ------------------------- @@ -152,6 +166,10 @@ class ChargeGPUMemory { PairGPUAtom *atom; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; + // --------------------------- NBOR DATA ---------------------------- /// Neighbor data @@ -171,8 +189,10 @@ class ChargeGPUMemory { protected: bool _compiled; - int _block_size; + int _block_size, _block_bio_size, _threads_per_atom; double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const char *pair_string); diff --git a/lib/gpu/cmm_cut_gpu.cpp b/lib/gpu/cmm_cut_gpu.cpp index 53976ff7e8..7be958615a 100644 --- a/lib/gpu/cmm_cut_gpu.cpp +++ b/lib/gpu/cmm_cut_gpu.cpp @@ -28,12 +28,12 @@ static CMM_GPU_Memory CMMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // 
--------------------------------------------------------------------------- -bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen) { +int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { CMMMF.clear(); gpu_mode=CMMMF.device->gpu_mode(); double gpu_split=CMMMF.device->particle_split(); @@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, fflush(screen); } - if (world_me==0) { - bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); CMMMF.device->world_barrier(); if (message) @@ -75,45 +73,45 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + CMMMF.device->gpu_barrier(); if 
(message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + CMMMF.estimate_gpu_overhead(); + return init_ok; } void cmm_gpu_clear() { CMMMF.clear(); } -int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** cmm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void cmm_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void cmm_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index 47504f621e..08cc31ed7f 100644 --- 
a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef CMM_GPU_KERNEL #define CMM_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,56 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if 
(offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -183,49 +238,64 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in,__global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + __global numtyp* sp_lj_in,__global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + 
+ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp index e5a83e5872..8a5949c9e7 100644 --- a/lib/gpu/cmm_cut_gpu_memory.cpp +++ b/lib/gpu/cmm_cut_gpu_memory.cpp @@ -42,22 +42,26 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,cmm_cut_gpu_kernel); +int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmm_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int cmm_types=ntypes; shared_types=false; - if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - cmm_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) { + cmm_types=max_shared_types; shared_types=true; } _cmm_types=cmm_types; @@ -84,7 +88,7 @@ bool 
CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/cmm_cut_gpu_memory.h b/lib/gpu/cmm_cut_gpu_memory.h index 8099d5b9c4..fff90e477d 100644 --- a/lib/gpu/cmm_cut_gpu_memory.h +++ b/lib/gpu/cmm_cut_gpu_memory.h @@ -29,13 +29,20 @@ class CMM_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param 
gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, int **host_cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, int **host_cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/cmmc_long_gpu.cpp b/lib/gpu/cmmc_long_gpu.cpp index a3fcf336c6..a6f3d090af 100644 --- a/lib/gpu/cmmc_long_gpu.cpp +++ b/lib/gpu/cmmc_long_gpu.cpp @@ -28,14 +28,14 @@ static CMML_GPU_Memory CMMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const 
double g_ewald) { +int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { CMMLMF.clear(); gpu_mode=CMMLMF.device->gpu_mode(); double gpu_split=CMMLMF.device->particle_split(); @@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, fflush(screen); } - if (world_me==0) { - bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, inum, - nall, 300, maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e,g_ewald); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); CMMLMF.device->world_barrier(); if (message) @@ -79,48 +76,51 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, inum, - nall, 300, maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e, 
g_ewald); CMMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + CMMLMF.estimate_gpu_overhead(); + return init_ok; } void cmml_gpu_clear() { CMMLMF.clear(); } -int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q,boxlo,prd); } -void cmml_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void cmml_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + 
CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); + host_q,nlocal,boxlo,prd); } double cmml_gpu_bytes() { diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index 4a19b5fe03..5153cb5016 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef CMML_GPU_KERNEL #define CMML_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -54,7 +52,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 
*lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -234,51 +291,67 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local 
numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp index 9a63bc5628..e2f99fceca 100644 --- a/lib/gpu/cmmc_long_gpu_memory.cpp +++ b/lib/gpu/cmmc_long_gpu_memory.cpp @@ -43,26 +43,30 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double **host_cut_ljsq, - const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,cmmc_long_gpu_kernel); +int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int 
max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, + const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmmc_long_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -95,7 +99,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -133,9 +137,10 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -144,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald); + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { 
this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald); + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/cmmc_long_gpu_memory.h b/lib/gpu/cmmc_long_gpu_memory.h index 8192c78249..45090368a5 100644 --- a/lib/gpu/cmmc_long_gpu_memory.h +++ b/lib/gpu/cmmc_long_gpu_memory.h @@ -29,15 +29,22 @@ class CMML_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, int ** cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double **host_cut_ljsq, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, int ** cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const 
int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/crml_gpu.cpp b/lib/gpu/crml_gpu.cpp index 7458300907..1e59562ed5 100644 --- a/lib/gpu/crml_gpu.cpp +++ b/lib/gpu/crml_gpu.cpp @@ -28,16 +28,16 @@ static CRML_GPU_Memory CRMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, const bool mix_arithmetic) { +int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald, const double cut_lj_innersq, + const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { CRMLMF.clear(); gpu_mode=CRMLMF.device->gpu_mode(); double gpu_split=CRMLMF.device->particle_split(); @@ -58,16 +58,13 @@ bool 
crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, - host_cut_ljsq, host_cut_coulsq, host_special_coul, - qqrd2e, g_ewald, cut_lj_innersq, denom_lj, - epsilon,sigma,mix_arithmetic); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, + epsilon,sigma,mix_arithmetic); CRMLMF.device->world_barrier(); if (message) @@ -82,50 +79,54 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald, - cut_lj_innersq, denom_lj, epsilon, sigma, - mix_arithmetic); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, + sigma, mix_arithmetic); + CRMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + CRMLMF.estimate_gpu_overhead(); + return init_ok; } void crml_gpu_clear() { CRMLMF.clear(); } -int * crml_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** 
crml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return CRMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } -void crml_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - CRMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); +void crml_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd) { + CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); } double crml_gpu_bytes() { diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu index 6ba6eaedca..63ce924581 
100644 --- a/lib/gpu/crml_gpu_kernel.cu +++ b/lib/gpu/crml_gpu_kernel.cu @@ -54,7 +54,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_BIO_PAIR 64 #endif @@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q) __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, - const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, + const int lj_types, __global numtyp *sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, const numtyp cut_lj_innersq) { + const numtyp cut_ljsq, const numtyp cut_lj_innersq, + const int t_per_atom) { + + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -120,29 +125,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + 
energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -240,50 +298,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, __global numtyp* sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_, const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald, - const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, - const numtyp cut_lj_innersq) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp denom_lj, + const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - ljd[ii]=ljd_in[ii]; - ljd[ii+64]=ljd_in[ii+64]; - - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + ljd[tid]=ljd_in[tid]; + if (tid+BLOCK_BIO_PAIR1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + 
red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/crml_gpu_memory.cpp b/lib/gpu/crml_gpu_memory.cpp index e877503e87..6661f67585 100644 --- a/lib/gpu/crml_gpu_memory.cpp +++ b/lib/gpu/crml_gpu_memory.cpp @@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CRML_GPU_MemoryT::init(const int ntypes, +int CRML_GPU_MemoryT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -56,20 +56,24 @@ bool CRML_GPU_MemoryT::init(const int ntypes, const double g_ewald, const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const bool mix_arithmetic) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,crml_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,crml_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (this->_block_size>=64 && mix_arithmetic) + if (this->_block_bio_size>=64 && mix_arithmetic) shared_types=true; _lj_types=lj_types; // Allocate a host write buffer for data initialization int h_size=lj_types*lj_types; - if (h_sizedevice->max_bio_shared_types(); + if (h_size 
host_write(h_size*32,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; iatom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, host_lj3,host_lj4); - ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY); + ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma); sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); @@ -99,7 +103,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -125,7 +129,7 @@ double CRML_GPU_MemoryT::host_memory_usage() const { template void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); + const int BX=this->_block_bio_size; int eflag, vflag; if (_eflag) eflag=1; @@ -137,9 +141,10 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -147,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq, - &_cut_ljsq, &_cut_lj_innersq); + &_cut_ljsq, &_cut_lj_innersq, + &this->_threads_per_atom); } else { 
this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, - &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq); + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/crml_gpu_memory.h b/lib/gpu/crml_gpu_memory.h index 5520cd3a17..a474d5982d 100644 --- a/lib/gpu/crml_gpu_memory.h +++ b/lib/gpu/crml_gpu_memory.h @@ -29,17 +29,24 @@ class CRML_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double host_cut_bothsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double host_cut_ljsq, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald, - const double cut_lj_innersq, const double denom_lj, - double **epsilon, double **sigma, const bool mix_arithmetic); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double host_cut_bothsq, + double **host_lj1, double 
**host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, + const double cut_lj_innersq, const double denom_lj, + double **epsilon, double **sigma, const bool mix_arithmetic); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp index 5ca88fd70f..70eb4d9344 100644 --- a/lib/gpu/gb_gpu.cpp +++ b/lib/gpu/gb_gpu.cpp @@ -49,14 +49,14 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool gb_gpu_init(const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen) { +int gb_gpu_init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **shape, + double **well, double **cutsq, double **sigma, + double **epsilon, double *host_lshape, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); gpu_mode=GBMF.device->gpu_mode(); double 
gpu_split=GBMF.device->particle_split(); @@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma, fflush(screen); } - if (world_me==0) { - bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, - inum, nall, max_nbors, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, + inum, nall, max_nbors, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) @@ -99,22 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, - inum, nall, max_nbors, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, + epsilon, host_lshape, form, host_lj1, host_lj2, + host_lj3, host_lj4, offset, special_lj, inum, nall, + max_nbors, cell_size, gpu_split, screen); + GBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + GBMF.estimate_gpu_overhead(); + return init_ok; } // --------------------------------------------------------------------------- @@ -131,8 +129,8 @@ template inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum, const int host_inum, const int nall, double **host_x, double **host_quat, - int *host_type, double *boxlo, - double *boxhi, bool &success) { + int *host_type, double *sublo, + double *subhi, bool &success) { gbm.nbor_time_avail=true; 
success=true; @@ -144,7 +142,7 @@ inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum, gbm.atom->cast_copy_x(host_x,host_type); int mn; gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom, - boxlo, boxhi, NULL, NULL, NULL, success, mn); + sublo, subhi, NULL, NULL, NULL, success, mn); gbm.nbor->copy_unpacked(inum,mn); gbm.last_ellipse=inum; gbm.max_last_ellipse=inum; @@ -163,7 +161,7 @@ void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, gbm.nbor_time_avail=true; - int mn=gbm.nbor->max_nbor_loop(inum,numj); + int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist); gbm.resize_atom(inum,nall,success); gbm.resize_local(inum,0,mn,osize,success); if (!success) @@ -216,9 +214,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(gbm.atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(gbm.ans->inum())/ + (BX/gbm._threads_per_atom))); int stride=gbm.nbor->nbor_pitch(); - int ainum=gbm.atom->inum(); + int ainum=gbm.ans->inum(); int anall=gbm.atom->nall(); if (gbm.multiple_forms) { @@ -226,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { if (gbm.last_ellipse>0) { // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ - static_cast(BX))); + (BX/gbm._threads_per_atom))); gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); @@ -237,11 +236,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), - &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(), - &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall); + &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(), + 
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall, + &gbm._threads_per_atom); gbm.time_gayberne.stop(); - if (gbm.last_ellipse==gbm.atom->inum()) { + if (gbm.last_ellipse==gbm.ans->inum()) { gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); @@ -254,9 +254,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // ------------ SPHERE_ELLIPSE --------------- gbm.time_kernel2.start(); - GX=static_cast(ceil(static_cast(gbm.atom->inum()- - gbm.last_ellipse)/BX)); - gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(), + GX=static_cast(ceil(static_cast(gbm.ans->inum()- + gbm.last_ellipse)/ + (BX/gbm._threads_per_atom))); + gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(), SPHERE_ELLIPSE,SPHERE_ELLIPSE); gbm.time_kernel2.stop(); @@ -266,13 +267,14 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), - &gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, - &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, + &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); gbm.time_gayberne2.stop(); } else { - gbm.atom->dev_ans.zero(); - gbm.atom->dev_engv.zero(); + gbm.ans->dev_ans.zero(); + gbm.ans->dev_engv.zero(); gbm.time_kernel.stop(); gbm.time_gayberne.start(); gbm.time_gayberne.stop(); @@ -284,29 +286,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // ------------ LJ --------------- gbm.time_pair.start(); - if (gbm.last_ellipseinum()) { + if (gbm.last_ellipseinum()) { if (gbm.shared_types) { GBMF.k_lj_fast.set_size(GX,BX); GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), 
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(), &stride, &gbm.nbor->dev_packed.begin(), - &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); } else { GBMF.k_lj.set_size(GX,BX); GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), &gbm.lj3.begin(), &gbm._lj_types, &gbm.gamma_upsilon_mu.begin(), &stride, - &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); } } gbm.time_pair.stop(); } else { gbm.time_kernel.start(); - gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE, + gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); @@ -315,9 +319,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), - &stride, &gbm.atom->dev_ans.begin(), &ainum, - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &ainum, &anall); + &stride, &gbm.ans->dev_ans.begin(), &ainum, + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom); gbm.time_gayberne.stop(); } } @@ -326,30 +330,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // Reneighbor on GPU if necessary and then compute forces, torques, energies // 
--------------------------------------------------------------------------- template -inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago, - const int inum_full, const int nall, - double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool eatom, +inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago, + const int inum_full, const int nall, + double **host_x, int *host_type, + double *sublo, double *subhi, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, - double **host_quat) { + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_quat) { gbm.acc_timers(); if (inum_full==0) { + host_start=0; gbm.zero_timers(); return NULL; } - gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor()); - int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full); - gbm.atom->inum(inum); + gbm.hd_balancer.balance(cpu_time); + int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full); + gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { _gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x, - host_quat, host_type, boxlo, boxhi, success); + host_quat, host_type, sublo, subhi, success); if (!success) return NULL; gbm.atom->cast_quat_data(host_quat[0]); @@ -361,47 +366,49 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago, gbm.atom->add_x_data(host_x,host_type); } - gbm.atom->add_other_data(); + gbm.atom->add_quat_data(); + *ilist=gbm.nbor->host_ilist.begin(); + *jnum=gbm.nbor->host_acc.begin(); _gb_gpu_gayberne(gbm,eflag,vflag); - gbm.atom->copy_answers(eflag,vflag,eatom,vatom); + gbm.ans->copy_answers(eflag,vflag,eatom,vatom); + gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); - return gbm.device->nbor.host_nbor.begin(); + return 
gbm.nbor->host_jlist.begin()-host_start; } -int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success, - double **host_quat) { - return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x, - host_type, boxlo, boxhi, eflag, vflag, eatom, vatom, - host_start, cpu_time, success, host_quat); +int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_quat) { + return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo, + subhi, eflag, vflag, eatom, vatom, host_start, ilist, + jnum, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, torques,.. 
// --------------------------------------------------------------------------- template -inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago, - const int inum_full,const int nall,double **host_x, - int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success, - double **host_quat) { +inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full, + const int nall,double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double **host_quat) { gbm.acc_timers(); if (inum_full==0) { + host_start=0; gbm.zero_timers(); return NULL; } int ago=gbm.hd_balancer.ago_first(f_ago); - int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time, - gbm.nbor->gpu_nbor()); - gbm.atom->inum(inum); + int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time); + gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; @@ -421,21 +428,21 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago, gbm.atom->cast_quat_data(host_quat[0]); gbm.hd_balancer.start_timer(); gbm.atom->add_x_data(host_x,host_type); - gbm.atom->add_other_data(); + gbm.atom->add_quat_data(); _gb_gpu_gayberne(gbm,eflag,vflag); - gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); return list; } -int * gb_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat) { 
- return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x, +int * gb_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat) { + return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, host_quat); diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h index 6ac390437a..a341940c0a 100644 --- a/lib/gpu/gb_gpu_extra.h +++ b/lib/gpu/gb_gpu_extra.h @@ -18,7 +18,6 @@ #ifndef GB_GPU_EXTRA_H #define GB_GPU_EXTRA_H -#define MAX_SHARED_TYPES 8 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef _DOUBLE_DOUBLE @@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" #else @@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #define BLOCK_SIZE_X get_local_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define __inline inline +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu index b8d06ec6da..7bb320f5d0 100644 --- a/lib/gpu/gb_gpu_kernel.cu +++ b/lib/gpu/gb_gpu_kernel.cu @@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q, __global acctyp4 *ans, const int astride, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int inum, - const int nall) { + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int 
ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); - __syncthreads(); - - if (ii0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-r; + r12[1]*=-r; + r12[2]*=-r; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + + // Torque on 1 + temp1 = -u_r*eta*factor_lj; + temp2 = -u_r*chi*factor_lj; + numtyp temp3 = -chi*eta*factor_lj; + tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; + tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; + tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; + + } // for nbor + } // if ii - // energy - - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - int mtype=mul24(ntypes,itype)+jtype; - numtyp sigma = sig_eps[mtype].x; - numtyp epsilon = sig_eps[mtype].y; - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12); - uslj_rsq = 
uslj_rsq*ir*ir; - - dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); - dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); - dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); - } - - // torque for particle 1 - { - numtyp tempv[3], tempv2[3]; - tempv[0] = -uslj_rsq*kappa[0]; - tempv[1] = -uslj_rsq*kappa[1]; - tempv[2] = -uslj_rsq*kappa[2]; - gpu_row_times3(kappa,g1,tempv2); - gpu_cross3(tempv,tempv2,tUr); - } + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; } } - - // Compute eta - { - eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - // Compute teta - numtyp temp[9], tempv[3], tempv2[3]; - compute_eta_torque(g12,a1,ishape,temp); - numtyp temp1 = -eta*gum[1]; + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; - tempv[0] = temp1*temp[0]; - tempv[1] = temp1*temp[1]; - tempv[2] = temp1*temp[2]; - gpu_cross3(a1,tempv,tempv2); - teta[0] = tempv2[0]; - teta[1] = tempv2[1]; - teta[2] = tempv2[2]; - - tempv[0] = temp1*temp[3]; - tempv[1] = temp1*temp[4]; - tempv[2] = temp1*temp[5]; - gpu_cross3(a1+3,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; - tempv[0] = temp1*temp[6]; - tempv[1] = temp1*temp[7]; - tempv[2] = temp1*temp[8]; - gpu_cross3(a1+6,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - } - - numtyp chi, dchi[3], tchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b2[9], b12[9]; - { - gpu_times3(well[jtype],a2,b12); - gpu_transpose_times3(a2,b12,b2); - gpu_plus3(b1,b2,b12); + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; 
+ } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; } - - // compute chi_12 - r12[0]*=r; - r12[1]*=r; - r12[2]*=r; - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - chi = gpu_dot3(r12,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/ - gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12[0]); - dchi[1] = temp2*(iota[1]-temp1*r12[1]); - dchi[2] = temp2*(iota[2]-temp1*r12[2]); - - // compute t_chi - numtyp tempv[3]; - gpu_row_times3(iota,b1,tempv); - gpu_cross3(tempv,iota,tchi); - temp1 = (numtyp)-4.0*ir*ir; - tchi[0] *= temp1; - tchi[1] *= temp1; - tchi[2] *= temp1; } - numtyp temp2 = factor_lj*eta*chi; - if (eflag>0) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { - r12[0]*=-r; - r12[1]*=-r; - r12[2]*=-r; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - f.x+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - f.y+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - f.z+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - f.x+=temp1*dchi[0]-temp2*dUr[0]; - f.y+=temp1*dchi[1]-temp2*dUr[1]; - f.z+=temp1*dchi[2]-temp2*dUr[2]; - } - - // Torque on 1 - temp1 = -u_r*eta*factor_lj; - temp2 = -u_r*chi*factor_lj; - numtyp temp3 = -chi*eta*factor_lj; - tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; - tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; - tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - - } // for nbor - // Store answers - __global acctyp *ap1=engv+ii; - if (eflag>0) { - *ap1=energy; - ap1+=astride; - } - if (vflag>0) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; + if (ii0) { + *ap1=energy; ap1+=astride; } - } - ans[ii]=f; - 
ans[ii+astride]=tor; + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=astride; + } + } + ans[ii]=f; + ans[ii+astride]=tor; } // if ii } diff --git a/lib/gpu/gb_gpu_kernel_lj.cu b/lib/gpu/gb_gpu_kernel_lj.cu index 3e42cbcbbc..657fc20cd5 100644 --- a/lib/gpu/gb_gpu_kernel_lj.cu +++ b/lib/gpu/gb_gpu_kernel_lj.cu @@ -34,33 +34,36 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q, __global acctyp4 *ans, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag,const int start, const int inum, - const int nall) { - __local numtyp sp_lj[4]; + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; - __syncthreads(); + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -265,39 
+307,42 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, __global acctyp4 *ans, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int start, const int inum, - const int nall) { - __local numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; - __syncthreads(); + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1+=energy; @@ -361,50 +445,54 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, __global numtyp *gum, - const int stride, - __global int *dev_ij, __global acctyp4 *ans, - __global acctyp *engv, __global int *err_flag, - const int eflag,const 
int vflag, const int start, - const int inum, const int nall) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1+=energy; diff --git a/lib/gpu/gb_gpu_kernel_nbor.cu b/lib/gpu/gb_gpu_kernel_nbor.cu index 80da8b8d9d..1b1d81fa42 100644 --- a/lib/gpu/gb_gpu_kernel_nbor.cu +++ b/lib/gpu/gb_gpu_kernel_nbor.cu @@ -18,8 +18,6 @@ #ifndef PAIR_GPU_KERNEL_H #define PAIR_GPU_KERNEL_H -#define MAX_SHARED_TYPES 8 
- #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -32,7 +30,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" #else @@ -42,6 +40,7 @@ #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define MAX_SHARED_TYPES 8 #endif diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp index 1d78204031..971649c6e8 100644 --- a/lib/gpu/gb_gpu_memory.cpp +++ b/lib/gpu/gb_gpu_memory.cpp @@ -32,30 +32,35 @@ template GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false), _max_bytes(0.0) { device=&pair_gpu_device; + ans=new PairGPUAns(); + nbor=new PairGPUNbor; } template GB_GPU_MemoryT::~GB_GPU_Memory() { clear(); + delete ans; + delete nbor; } template int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { - return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); } template -bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, - const double upsilon, const double mu, - double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, - int **h_form, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, - double **host_offset, const double *host_special_lj, - const int nlocal, const int nall, - const int max_nbors, const double cell_size, - const double gpu_split, FILE *_screen) { +int GB_GPU_MemoryT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, + int **h_form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, + double **host_offset, const double *host_special_lj, + const int nlocal, const int nall, + 
const int max_nbors, const double cell_size, + const double gpu_split, FILE *_screen) { nbor_time_avail=false; screen=_screen; @@ -64,24 +69,24 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, gpu_nbor=true; int _gpu_host=0; - int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; - if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host, - max_nbors,cell_size,true)) - return false; + _threads_per_atom=device->threads_per_atom(); + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0, + _gpu_host,max_nbors,cell_size,true); + if (success!=0) + return success; + ucl_device=device->gpu; atom=&device->atom; - nbor=&device->nbor; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + _block_size=device->pair_block_size(); compile_kernels(*ucl_device); // Initialize host-device load balancer - hd_balancer.init(device,gpu_split); + hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_pair.init(*ucl_device); @@ -90,8 +95,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=device->max_shared_types(); + if (lj_types<=max_shared_types && _block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -186,12 +192,19 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, } if (multiple_forms) - atom->dev_ans.zero(); + ans->dev_ans.zero(); - _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); // Memory for ilist ordered by particle type - return 
(host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS); + if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS) + return 0; + else return -3; +} + +template +void GB_GPU_MemoryT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead); } template @@ -209,9 +222,9 @@ void GB_GPU_MemoryT::clear() { // Output any timing information acc_timers(); - double single[6], times[6]; + double single[9], times[9]; - single[0]=atom->transfer_time(); + single[0]=atom->transfer_time()+ans->transfer_time(); single[1]=nbor->time_nbor.total_seconds(); single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+ nbor->time_kernel.total_seconds(); @@ -220,15 +233,18 @@ void GB_GPU_MemoryT::clear() { single[4]=time_pair.total_seconds(); else single[4]=0; - single[5]=atom->cast_time(); + single[5]=atom->cast_time()+ans->cast_time(); + single[6]=_gpu_overhead; + single[7]=_driver_overhead; + single[8]=ans->cpu_idle_time(); - MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica()); + MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica()); double avg_split=hd_balancer.all_avg_split(); _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+ sigma_epsilon.row_bytes()+cut_form.row_bytes()+ shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+ - gamma_upsilon_mu.row_bytes(); + gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); @@ -255,10 +271,19 @@ void GB_GPU_MemoryT::clear() { fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } + fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); + fprintf(screen,"CPU Idle_Time: %.4f 
s.\n",times[8]/replica_size); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); + + + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + + } _max_bytes=0.0; @@ -299,10 +324,9 @@ void GB_GPU_MemoryT::clear() { template double GB_GPU_MemoryT::host_memory_usage() const { - return device->atom.host_memory_usage()+ - device->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(GB_GPU_Memory)+ - device->nbor.max_atoms()*sizeof(int); + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(GB_GPU_Memory)+ + nbor->max_atoms()*sizeof(int); } template diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index 2cfc805cd8..40ed8bec51 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef GB_GPU_MEMORY_H #define GB_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" @@ -35,23 +33,34 @@ class GB_GPU_Memory { * \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * \return false if there is not sufficient memory or device init prob **/ - bool init(const int ntypes, const double gamma, - const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - const double *host_special_lj, const int nlocal, const int nall, - const int max_nbors, const double cell_size, - const double gpu_split, FILE *screen); + * \return false if there is not sufficient memory or device init prob + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the 
GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **host_shape, + double **host_well, double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *screen); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - atom->resize(inum, nall, success); - if (multiple_forms) atom->dev_ans.zero(); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + atom->resize(nall, success); + ans->resize(inum, success); + if (multiple_forms) ans->dev_ans.zero(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_bytes) _max_bytes=bytes; } @@ -74,7 +83,7 @@ class GB_GPU_Memory { success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); } nbor->resize(nlocal,host_inum,max_nbors,success); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_bytes) _max_bytes=bytes; } @@ -91,19 +100,22 @@ class GB_GPU_Memory { /// Accumulate timers inline void acc_timers() { - if (nbor_time_avail) { - nbor->time_nbor.add_to_total(); - nbor->time_kernel.add_to_total(); - nbor_time_avail=false; + if (device->time_device()) { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_kernel.add_to_total(); + time_gayberne.add_to_total(); + if (multiple_forms) { + 
time_kernel2.add_to_total(); + time_gayberne2.add_to_total(); + time_pair.add_to_total(); + } + atom->acc_timers(); + ans->acc_timers(); } - time_kernel.add_to_total(); - time_gayberne.add_to_total(); - if (multiple_forms) { - time_kernel2.add_to_total(); - time_gayberne2.add_to_total(); - time_pair.add_to_total(); - } - atom->acc_timers(); } /// Accumulate timers @@ -117,6 +129,7 @@ class GB_GPU_Memory { time_pair.zero(); } atom->zero_timers(); + ans->zero_timers(); } // -------------------------- DEVICE DATA ------------------------- @@ -168,6 +181,10 @@ class GB_GPU_Memory { int last_ellipse, max_last_ellipse; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; + // --------------------------- NBOR DATA ---------------------------- /// Neighbor data @@ -183,10 +200,12 @@ class GB_GPU_Memory { UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj; inline int block_size() { return _block_size; } + int _threads_per_atom; private: bool _allocated, _compiled; int _block_size; double _max_bytes; + double _gpu_overhead, _driver_overhead; void compile_kernels(UCL_Device &dev); }; diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt index 77e0a073c7..d260cab24e 100644 --- a/lib/gpu/geryon/VERSION.txt +++ b/lib/gpu/geryon/VERSION.txt @@ -1,2 +1,2 @@ -Geryon Version 10.280 - \ No newline at end of file +Geryon Version 11.094 + diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h index ed445716f6..6a232986ff 100644 --- a/lib/gpu/geryon/nvc_device.h +++ b/lib/gpu/geryon/nvc_device.h @@ -167,6 +167,7 @@ class UCL_Device { int _device, _num_devices; std::vector _properties; std::vector _cq; + std::vector _device_ids; }; // Grabs the properties for all devices @@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() { if (deviceProp.major == 9999 && deviceProp.minor == 9999) break; _properties.push_back(deviceProp); + _device_ids.push_back(dev); } _device=-1; _cq.push_back(cudaStream_t()); @@ -194,7 +196,7 @@ 
inline void UCL_Device::set(int num) { return; for (int i=1; i + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) 
{ + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); 
add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + 
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + } + // --------------------------------------------------------------------------- @@ -439,6 +624,211 @@ run(); } + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + 
t21 *a21, t22 *a22, t23 *a23) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + clear_args(); + add_arg(a1); add_arg(a2); 
add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 
*a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + run(); + } + // --------------------------------------------------------------------------- template @@ -671,3 +1061,208 @@ run(cq); } + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + 
add_arg(a21); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 
*a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + 
add_arg(a26); add_arg(a27); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); 
add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + run(cq); + } + diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index c0531b2f29..11ec58629a 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _rows=rows; - _cols=cols; + int err=_device_alloc(*this,cq,rows,cols,_pitch,kind); - _row_size=_pitch/sizeof(numtyp); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+_row_size*cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; exit(1); + #endif + return err; } + + _kind=kind; + _rows=rows; + _cols=cols; + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; #endif #ifdef _OCL_MAT _offset=0; @@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _rows=rows; - _cols=cols; + int err=_device_alloc(*this,device,rows,cols,_pitch,kind); - 
_row_size=_pitch/sizeof(numtyp); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+_row_size*cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; exit(1); + #endif + return err; } + + _kind=kind; + _rows=rows; + _cols=cols; + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; #endif #ifdef _OCL_MAT _offset=0; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 45c94bee82..0be063c940 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -60,19 +60,24 @@ class UCL_D_Vec : public UCL_BaseMat { const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _cols=cols; + _row_bytes=cols*sizeof(numtyp); int err=_device_alloc(*this,cq,_row_bytes,kind); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on device.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } + + _kind=kind; + _cols=cols; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; #endif #ifdef _OCL_MAT _offset=0; @@ -90,19 +95,23 @@ class UCL_D_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _cols=cols; _row_bytes=cols*sizeof(numtyp); int err=_device_alloc(*this,device,_row_bytes,kind); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on device.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } + + _kind=kind; + _cols=cols; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; #endif #ifdef _OCL_MAT _offset=0; diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 51593cfa23..762bb03131 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { } + UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { + #ifdef _OCL_MAT + _carray=(cl_mem)(0); + #endif + } ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } /// Construct with specied number of rows and columns @@ -59,18 +63,23 @@ class UCL_H_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; - _rows=rows; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; - int err=_host_alloc(*this,cq,_row_bytes*_rows,kind); - #ifndef UCL_NO_EXIT + int err=_host_alloc(*this,cq,_row_bytes*rows,kind); if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _rows=rows; + _kind=kind; _end=_array+rows*cols; return err; } @@ -85,19 +94,24 @@ class UCL_H_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; - _rows=rows; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; - int err=_host_alloc(*this,device,_row_bytes*_rows,kind); - _end=_array+rows*cols; - #ifndef UCL_NO_EXIT + int err=_host_alloc(*this,device,_row_bytes*rows,kind); if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _rows=rows; + _kind=kind; + _end=_array+rows*cols; return err; } diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index ca1dd12a47..4af1e2179f 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. 
Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { } + UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { + #ifdef _OCL_MAT + _carray=(cl_mem)(0); + #endif + } ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } /// Construct with n columns @@ -59,18 +63,24 @@ class UCL_H_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; int err=_host_alloc(*this,cq,_row_bytes,kind); - _end=_array+cols; - #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _kind=kind; + _end=_array+cols; return err; } @@ -84,18 +94,24 @@ class UCL_H_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; int err=_host_alloc(*this,device,_row_bytes,kind); - _end=_array+cols; - #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _kind=kind; + _end=_array+cols; return err; } diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h index 1ea9175e3a..5c45dc3a87 100644 --- a/lib/gpu/geryon/ucl_nv_kernel.h +++ 
b/lib/gpu/geryon/ucl_nv_kernel.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -25,8 +25,18 @@ #ifndef UCL_NV_KERNEL_H #define UCL_NV_KERNEL_H -#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x) -#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y) +#if (__CUDA_ARCH__ < 200) +#define mul24 __mul24 +#define MEM_THREADS 16 +#else +#define mul24(X,Y) (X)*(Y) +#define MEM_THREADS 32 +#endif + +#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) +#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) +#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); +#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y #define BLOCK_ID_X blockIdx.x @@ -35,8 +45,9 @@ #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define mul24 __mul24 #define __global #define __inline static __inline__ __device__ +#define atom_add atomicAdd #endif + diff --git a/lib/gpu/lj96_cut_gpu.cpp b/lib/gpu/lj96_cut_gpu.cpp index 24fb5d8570..df83afd521 100644 --- a/lib/gpu/lj96_cut_gpu.cpp +++ b/lib/gpu/lj96_cut_gpu.cpp @@ -28,11 +28,11 @@ static LJ96_GPU_Memory LJ96MF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - 
const double cell_size, int &gpu_mode, FILE *screen) { +int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { LJ96MF.clear(); gpu_mode=LJ96MF.device->gpu_mode(); double gpu_split=LJ96MF.device->particle_split(); @@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); LJ96MF.device->world_barrier(); if (message) @@ -74,46 +72,46 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, - nall, 300, maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + LJ96MF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJ96MF.estimate_gpu_overhead(); + return init_ok; } void lj96_gpu_clear() { LJ96MF.clear(); } -int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** lj96_gpu_compute_n(const int ago, const int inum_full, const int nall, 
double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void lj96_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +void lj96_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double lj96_gpu_bytes() { diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu index 0d3a01fbac..3fc6a2f308 100644 --- a/lib/gpu/lj96_cut_gpu_kernel.cu +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJ96_GPU_KERNEL #define LJ96_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef 
NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for 
(int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -176,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + 
virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp index d365d71044..0b066c0973 100644 --- a/lib/gpu/lj96_cut_gpu_memory.cpp +++ b/lib/gpu/lj96_cut_gpu_memory.cpp @@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJ96_GPU_MemoryT::init(const int ntypes, +int LJ96_GPU_MemoryT::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -50,14 +50,18 @@ bool LJ96_GPU_MemoryT::init(const int ntypes, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj96_cut_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj96_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -84,7 +88,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int 
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lj96_cut_gpu_memory.h b/lib/gpu/lj96_cut_gpu_memory.h index 483ef05570..fe0a0b1665 100644 --- a/lib/gpu/lj96_cut_gpu_memory.h +++ b/lib/gpu/lj96_cut_gpu_memory.h @@ -29,13 +29,20 @@ class LJ96_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of 
memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp index 12fab2f9f1..aef085f7c9 100644 --- a/lib/gpu/lj_cut_gpu.cpp +++ b/lib/gpu/lj_cut_gpu.cpp @@ -28,12 +28,11 @@ static LJL_GPU_Memory LJLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool ljl_gpu_init(const int ntypes, double **cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen) { +int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { LJLMF.clear(); gpu_mode=LJLMF.device->gpu_mode(); double gpu_split=LJLMF.device->particle_split(); @@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, fflush(screen); } - if (world_me==0) { - bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + 
int init_ok=0; + if (world_me==0) + init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); LJLMF.device->world_barrier(); if (message) @@ -75,45 +72,45 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + LJLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJLMF.estimate_gpu_overhead(); + return init_ok; } void ljl_gpu_clear() { LJLMF.clear(); } -int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int ** ljl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void ljl_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double 
**host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void ljl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu index 0e72e41f36..75f36446f7 100644 --- a/lib/gpu/lj_cut_gpu_kernel.cu +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJ_GPU_KERNEL #define LJ_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int 
t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -175,49 +229,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 
lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp index 23b2fcf6d0..a294eb647f 100644 --- a/lib/gpu/lj_cut_gpu_memory.cpp +++ b/lib/gpu/lj_cut_gpu_memory.cpp @@ -42,22 +42,26 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJL_GPU_MemoryT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_cut_gpu_kernel); +int LJL_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double 
**host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -84,7 +88,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), 
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lj_cut_gpu_memory.h b/lib/gpu/lj_cut_gpu_memory.h index 123b739649..4b86b133a1 100644 --- a/lib/gpu/lj_cut_gpu_memory.h +++ b/lib/gpu/lj_cut_gpu_memory.h @@ -29,13 +29,20 @@ class LJL_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/ljc_cut_gpu.cpp b/lib/gpu/ljc_cut_gpu.cpp index 955a25adce..de6f4f3e62 100644 --- a/lib/gpu/ljc_cut_gpu.cpp +++ 
b/lib/gpu/ljc_cut_gpu.cpp @@ -28,13 +28,13 @@ static LJC_GPU_Memory LJCMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e) { +int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { LJCMF.clear(); gpu_mode=LJCMF.device->gpu_mode(); double gpu_split=LJCMF.device->particle_split(); @@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, - host_cut_ljsq, host_cut_coulsq, host_special_coul, - qqrd2e); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->world_barrier(); if (message) @@ -78,48 +75,51 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, 
last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + LJCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJCMF.estimate_gpu_overhead(); + return init_ok; } void ljc_gpu_clear() { LJCMF.clear(); } -int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** ljc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } -void ljc_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool 
vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); +void ljc_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); } double ljc_gpu_bytes() { diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu index 2751e20702..44a607588a 100644 --- a/lib/gpu/ljc_cut_gpu_kernel.cu +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJC_GPU_KERNEL #define LJC_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -82,6 +80,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -92,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , __global numtyp *cutsq, - const numtyp qqrd2e) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + 
__global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -109,29 +113,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -209,54 +266,69 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , __global numtyp *_cutsq, - const numtyp qqrd2e) { - // ii indexes the two interacting particles in gi - int 
ii=THREAD_ID_X; + const numtyp qqrd2e, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp index d63ed6e5d9..642ff6ecc7 100644 --- a/lib/gpu/ljc_cut_gpu_memory.cpp +++ b/lib/gpu/ljc_cut_gpu_memory.cpp @@ -43,24 +43,28 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJC_GPU_MemoryT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double 
*host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,ljc_cut_gpu_kernel); +int LJC_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ljc_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -95,7 +99,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ sp_lj.row_bytes(); - return true; + return 0; } template @@ -134,9 +138,10 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); 
this->time_pair.start(); @@ -145,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), - &_qqrd2e); + &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &cutsq.begin(), &_qqrd2e); + &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/ljc_cut_gpu_memory.h b/lib/gpu/ljc_cut_gpu_memory.h index 4dedce957a..552f9d9881 100644 --- a/lib/gpu/ljc_cut_gpu_memory.h +++ b/lib/gpu/ljc_cut_gpu_memory.h @@ -29,15 +29,22 @@ class LJC_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double **host_cut_ljsq, - double **host_cut_coulsq, double *host_special_coul, - const double qqrd2e); + * \param 
gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/ljcl_cut_gpu.cpp b/lib/gpu/ljcl_cut_gpu.cpp index 8fa15998bf..167f41b374 100644 --- a/lib/gpu/ljcl_cut_gpu.cpp +++ b/lib/gpu/ljcl_cut_gpu.cpp @@ -28,14 +28,14 @@ static LJCL_GPU_Memory LJCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald) { +int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, 
double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { LJCLMF.clear(); gpu_mode=LJCLMF.device->gpu_mode(); double gpu_split=LJCLMF.device->particle_split(); @@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, - host_cut_ljsq, host_cut_coulsq, host_special_coul, - qqrd2e,g_ewald); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->world_barrier(); if (message) @@ -79,48 +76,51 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + LJCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJCLMF.estimate_gpu_overhead(); + return init_ok; } void ljcl_gpu_clear() { LJCLMF.clear(); } -int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** ljcl_gpu_compute_n(const int ago, const int inum_full, 
const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } -void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void ljcl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); + host_q,nlocal,boxlo,prd); } double ljcl_gpu_bytes() { diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index a0b27f0259..7be7a86114 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef 
LJCL_GPU_KERNEL #define LJCL_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -54,7 +52,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + 
f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -225,52 +282,68 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; 
s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp index a126309a92..f37e6b1857 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.cpp +++ b/lib/gpu/ljcl_cut_gpu_memory.cpp @@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJCL_GPU_MemoryT::init(const int ntypes, +int LJCL_GPU_MemoryT::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -54,14 +54,18 @@ bool LJCL_GPU_MemoryT::init(const int ntypes, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,ljcl_cut_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ljcl_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -94,7 +98,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes, 
_allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -132,9 +136,10 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -143,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald); + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald); + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/ljcl_cut_gpu_memory.h b/lib/gpu/ljcl_cut_gpu_memory.h index 056ba0e41f..fae4c07040 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.h +++ b/lib/gpu/ljcl_cut_gpu_memory.h @@ -29,15 +29,22 @@ class LJCL_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of 
rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double **host_cut_ljsq, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp index 0ca2345087..e34a15c0b9 100644 --- a/lib/gpu/pair_gpu_atom.cpp +++ b/lib/gpu/pair_gpu_atom.cpp @@ -29,9 +29,8 @@ __win_sort _win_sort; #endif template -PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false), - _vflag(false),_inum(0),_ilist(NULL), - _newton(false) { +PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false), + _max_gpu_bytes(0) { #ifndef USE_OPENCL sort_config.op = CUDPP_ADD; sort_config.datatype = CUDPP_UINT; @@ -56,28 +55,20 @@ int 
PairGPUAtomT::bytes_per_atom() const { int id_space=0; if (_gpu_nbor) id_space=2; - int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space; + int bytes=4*sizeof(numtyp)+id_space; if (_rot) - bytes+=4*sizeof(numtyp)+4*sizeof(acctyp); + bytes+=4*sizeof(numtyp); if (_charge) bytes+=sizeof(numtyp); return bytes; } template -bool PairGPUAtomT::alloc(const int inum, const int nall) { +bool PairGPUAtomT::alloc(const int nall) { _max_atoms=static_cast(static_cast(nall)*1.10); - if (_newton) - _max_local=_max_atoms; - else - _max_local=static_cast(static_cast(inum)*1.10); bool success=true; - int ans_elements=4; - if (_rot) - ans_elements+=4; - // Ignore host/device transfers? bool cpuview=false; if (dev->device_type()==UCL_CPU) @@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { success=success && (host_x.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #endif - success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS); - success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_charge) success=success && (host_q.alloc(_max_atoms,*dev, @@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { // --------------------------- Device allocations - _gpu_bytes=0; + int gpu_bytes=0; if (cpuview) { #ifdef GPU_CAST assert(0==1); #else dev_x.view(host_x); #endif - dev_engv.view(host_engv); - dev_ans.view(host_ans); if (_rot) dev_quat.view(host_quat); if (_charge) @@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); success=success && (UCL_SUCCESS== dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); - _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); + gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); #else success=success && (UCL_SUCCESS== dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); #endif - success=success && 
(dev_engv.alloc(_ev_fields*_max_local,*dev, - UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (dev_ans.alloc(ans_elements*_max_local, - *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); if (_charge) { success=success && (dev_q.alloc(_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_q.row_bytes(); + gpu_bytes+=dev_q.row_bytes(); } if (_rot) { success=success && (dev_quat.alloc(_max_atoms*4,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_quat.row_bytes(); + gpu_bytes+=dev_quat.row_bytes(); } } if (_gpu_nbor) { success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); - _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); + gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); if (_bonds) { success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); - _gpu_bytes+=dev_tag.row_bytes(); + gpu_bytes+=dev_tag.row_bytes(); } } - _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes(); + gpu_bytes+=dev_x.row_bytes(); + if (gpu_bytes>_max_gpu_bytes) + _max_gpu_bytes=gpu_bytes; _allocated=true; return success; } template -bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, - const bool rot, UCL_Device &devi, const bool gpu_nbor, +bool PairGPUAtomT::add_fields(const bool charge, const bool rot, + const bool gpu_nbor, const bool bonds) { + bool realloc=false; + if (charge && _charge==false) { + _charge=true; + realloc=true; + } + if (rot && _rot==false) { + _rot=true; + realloc=true; + } + if (gpu_nbor && _gpu_nbor==false) { + _gpu_nbor=true; + realloc=true; + } + if (bonds && _bonds==false) { + _bonds=true; + realloc=true; + } + if (realloc) { + _other=_charge || _rot; + int max_atoms=_max_atoms; + clear_resize(); + return alloc(max_atoms); + } + return true; +} + +template +bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot, + UCL_Device &devi, const bool gpu_nbor, const bool bonds) { clear(); 
bool success=true; + _x_avail=false; + _q_avail=false; + _quat_avail=false; + _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; @@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, _other=_charge || _rot; dev=&devi; - _e_fields=1; - if (_charge) - _e_fields++; - _ev_fields=6+_e_fields; - // Initialize atom and nbor data - int ef_inum=inum; - if (ef_inum==0) - ef_inum=1000; int ef_nall=nall; - if (ef_nall<=ef_inum) - ef_nall=ef_inum*2; + if (ef_nall==0) + ef_nall=2000; // Initialize timers for the selected device time_pos.init(*dev); - time_other.init(*dev); - time_answer.init(*dev); + time_q.init(*dev); + time_quat.init(*dev); time_pos.zero(); - time_other.zero(); - time_answer.zero(); + time_q.zero(); + time_quat.zero(); _time_cast=0.0; #ifdef GPU_CAST compile_kernels(*dev); #endif - return success && alloc(ef_inum,ef_nall); + return success && alloc(ef_nall); } template @@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() { dev_quat.clear(); host_quat.clear(); } - dev_ans.clear(); - dev_engv.clear(); #ifndef GPU_CAST host_x.clear(); #else host_x_cast.clear(); host_type_cast.clear(); #endif - host_ans.clear(); - host_engv.clear(); dev_cell_id.clear(); dev_particle_id.clear(); dev_tag.clear(); @@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() { template void PairGPUAtomT::clear() { - _gpu_bytes=0; + _max_gpu_bytes=0; if (!_allocated) return; time_pos.clear(); - time_other.clear(); - time_answer.clear(); + time_q.clear(); + time_quat.clear(); clear_resize(); - _inum=0; - _eflag=false; - _vflag=false; #ifdef GPU_CAST if (_compiled) { @@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const { atom_bytes+=1; if (_rot) atom_bytes+=4; - int ans_bytes=atom_bytes+_ev_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+ - ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(PairGPUAtom); } -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool 
vf_atom) { - time_answer.start(); - _eflag=eflag; - _vflag=vflag; - _ef_atom=ef_atom; - _vf_atom=vf_atom; - - int csize=_ev_fields; - if (!eflag) - csize-=_e_fields; - if (!vflag) - csize-=6; - - if (csize>0) - ucl_copy(host_engv,dev_engv,_inum*csize,true); - if (_rot) - ucl_copy(host_ans,dev_ans,_inum*4*2,true); - else - ucl_copy(host_ans,dev_ans,_inum*4,true); - time_answer.stop(); -} - -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, - int *ilist) { - _ilist=ilist; - copy_answers(eflag,vflag,ef_atom,vf_atom); -} - -template -double PairGPUAtomT::energy_virial(double *eatom, double **vatom, - double *virial) { - if (_eflag==false && _vflag==false) - return 0.0; - - double evdwl=0.0; - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } else { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - int ii=_ilist[i]; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } - - evdwl*=0.5; - return evdwl; -} - -template -double PairGPUAtomT::energy_virial(double *eatom, double **vatom, - double *virial, double &ecoul) { - if (_eflag==false && _vflag==false) { - ecoul=0.0; - return 0.0; - } - - if (_charge==false) - return energy_virial(eatom,vatom,virial); 
- - double evdwl=0.0; - double _ecoul=0.0; - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - _ecoul+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - _ecoul+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } else { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - int ii=_ilist[i]; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - _ecoul+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - _ecoul+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } - - evdwl*=0.5; - ecoul+=_ecoul*0.5; - return evdwl; -} - -template -void PairGPUAtomT::get_answers(double **f, double **tor) { - acctyp *ap=host_ans.begin(); - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - f[i][0]+=*ap; - ap++; - f[i][1]+=*ap; - ap++; - f[i][2]+=*ap; - ap+=2; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - tor[i][0]+=*ap; - ap++; - tor[i][1]+=*ap; - ap++; - tor[i][2]+=*ap; - ap+=2; - } - } - } else { - for (int i=0; i<_inum; i++) { - int ii=_ilist[i]; - f[ii][0]+=*ap; - ap++; - f[ii][1]+=*ap; - ap++; - f[ii][2]+=*ap; - ap+=2; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - int ii=_ilist[i]; - tor[ii][0]+=*ap; - ap++; - tor[ii][1]+=*ap; - ap++; - tor[ii][2]+=*ap; - ap+=2; - } - } - } -} - // Sort arrays for neighbor list calculation template void PairGPUAtomT::sort_neighbor(const int num_atoms) { diff --git 
a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index e0a1fd9fb1..526c146f37 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -23,7 +23,6 @@ #ifdef USE_OPENCL -#include "geryon/ocl_device.h" #include "geryon/ocl_timer.h" #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" @@ -32,7 +31,6 @@ using namespace ucl_opencl; #else #include "cudpp.h" -#include "geryon/nvd_device.h" #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" #include "geryon/nvd_kernel.h" @@ -40,10 +38,6 @@ using namespace ucl_cudadr; #endif -#ifndef int2 -struct int2 { int x; int y; }; -#endif - #include "pair_gpu_precision.h" template @@ -56,13 +50,9 @@ class PairGPUAtom { inline int max_atoms() const { return _max_atoms; } /// Current number of local+ghost atoms stored inline int nall() const { return _nall; } - /// Current number of local atoms stored - inline int inum() const { return _inum; } /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - /// Set number of local atoms for future copy operations - inline void inum(const int n) { _inum=n; } /// Memory usage per atom in this class int bytes_per_atom() const; @@ -70,21 +60,33 @@ class PairGPUAtom { /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor True if neighboring will be performed on device **/ - bool init(const int inum, const int nall, const bool charge, const bool rot, + bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false); /// Check if we have enough device storage and realloc if not - inline bool resize(const int inum, const int nall, bool &success) { - _inum=inum; + /** Returns true if resized with any call during this timestep **/ + inline bool resize(const int nall, bool &success) { _nall=nall; - if (inum>_max_local || nall>_max_atoms) { + if (nall>_max_atoms) { clear_resize(); - 
success = success && alloc(inum,nall); - return true; + success = success && alloc(nall); + _resized=true; } - return false; + return _resized; } - + + /// If already initialized by another LAMMPS style, add fields as necessary + /** \param rot True if atom storage needs quaternions + * \param gpu_nbor True if neighboring will be performed on device **/ + bool add_fields(const bool charge, const bool rot, const bool gpu_nbor, + const bool bonds); + + /// Returns true if GPU is using charges + bool charge() { return _charge; } + + /// Returns true if GPU is using quaternions + bool quat() { return _rot; } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -100,28 +102,42 @@ class PairGPUAtom { /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); - time_answer.add_to_total(); - if (_other) - time_other.add_to_total(); + if (_charge) + time_q.add_to_total(); + if (_rot) + time_quat.add_to_total(); } /// Add copy times to timers inline void zero_timers() { time_pos.zero(); - time_answer.zero(); - if (_other) - time_other.zero(); + if (_charge) + time_q.zero(); + if (_rot) + time_quat.zero(); } /// Return the total time for host/device data transfer + /** Zeros the total so that the atom times are only included once **/ inline double transfer_time() { - double total=time_pos.total_seconds()+time_answer.total_seconds(); - if (_other) total+=time_other.total_seconds(); + double total=time_pos.total_seconds(); + time_pos.zero_total(); + if (_charge) { + total+=time_q.total_seconds(); + time_q.zero_total(); + } + if (_rot) { + total+=time_q.total_seconds(); + time_quat.zero_total(); + } + return total; } /// Return the total time for data cast/pack - inline double cast_time() { return _time_cast; } + /** Zeros the time so that atom times are only included once **/ + inline double cast_time() + { double t=_time_cast; _time_cast=0.0; return t; } /// Pack LAMMPS atom type constants into matrix and copy to device 
template @@ -216,43 +232,52 @@ class PairGPUAtom { // -------------------------COPY TO GPU ---------------------------------- + /// Signal that we need to transfer atom data for next timestep + inline void data_unavail() + { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; } + /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { - double t=MPI_Wtime(); - #ifdef GPU_CAST - memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); - memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); - #else - numtyp *_write_loc=host_x.begin(); - for (int i=0; i<_nall; i++) { - *_write_loc=host_ptr[i][0]; - _write_loc++; - *_write_loc=host_ptr[i][1]; - _write_loc++; - *_write_loc=host_ptr[i][2]; - _write_loc++; - *_write_loc=host_type[i]; - _write_loc++; + if (_x_avail==false) { + double t=MPI_Wtime(); + #ifdef GPU_CAST + memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); + #else + numtyp *_write_loc=host_x.begin(); + for (int i=0; i<_nall; i++) { + *_write_loc=host_ptr[i][0]; + _write_loc++; + *_write_loc=host_ptr[i][1]; + _write_loc++; + *_write_loc=host_ptr[i][2]; + _write_loc++; + *_write_loc=host_type[i]; + _write_loc++; + } + #endif + _time_cast+=MPI_Wtime()-t; } - #endif - _time_cast+=MPI_Wtime()-t; - } + } /// Copy positions and types to device asynchronously /** Copies nall() elements **/ inline void add_x_data(double **host_ptr, int *host_type) { time_pos.start(); - #ifdef GPU_CAST - ucl_copy(dev_x_cast,host_x_cast,_nall*3,true); - ucl_copy(dev_type_cast,host_type_cast,_nall,true); - int block_size=64; - int GX=static_cast(ceil(static_cast(_nall)/block_size)); - k_cast_x.set_size(GX,block_size); - k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), - &_nall); - #else - ucl_copy(dev_x,host_x,_nall*4,true); - #endif + if (_x_avail==false) { + #ifdef GPU_CAST + 
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true); + ucl_copy(dev_type_cast,host_type_cast,_nall,true); + int block_size=64; + int GX=static_cast(ceil(static_cast(_nall)/block_size)); + k_cast_x.set_size(GX,block_size); + k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), + &_nall); + #else + ucl_copy(dev_x,host_x,_nall*4,true); + #endif + _x_avail=true; + } time_pos.stop(); } @@ -262,87 +287,68 @@ class PairGPUAtom { add_x_data(host_ptr,host_type); } - /// Cast charges to write buffer + // Cast charges to write buffer template inline void cast_q_data(cpytyp *host_ptr) { - double t=MPI_Wtime(); - if (dev->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_q.view((numtyp*)host_ptr,_nall,*dev); - dev_q.view(host_q); - } else - for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; - } else { - if (sizeof(numtyp)==sizeof(double)) - memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); - else - for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + if (_q_avail==false) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_q.view((numtyp*)host_ptr,_nall,*dev); + dev_q.view(host_q); + } else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); + else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; } - _time_cast+=MPI_Wtime()-t; } - /// Copy charges to device asynchronously + // Copy charges to device asynchronously inline void add_q_data() { - ucl_copy(dev_q,host_q,_nall,true); + if (_q_avail==false) { + ucl_copy(dev_q,host_q,_nall,true); + _q_avail=true; + } } - /// Cast quaternions to write buffer + // Cast quaternions to write buffer template inline void cast_quat_data(cpytyp *host_ptr) { - double t=MPI_Wtime(); - if (dev->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_quat.view((numtyp*)host_ptr,_nall*4,*dev); - 
dev_quat.view(host_quat); - } else - for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; - } else { - if (sizeof(numtyp)==sizeof(double)) - memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); - else - for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + if (_quat_avail==false) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_quat.view((numtyp*)host_ptr,_nall*4,*dev); + dev_quat.view(host_quat); + } else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); + else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; } - _time_cast+=MPI_Wtime()-t; } - /// Copy quaternions to device + // Copy quaternions to device /** Copies nall()*4 elements **/ inline void add_quat_data() { - ucl_copy(dev_quat,host_quat,_nall*4,true); + if (_quat_avail==false) { + ucl_copy(dev_quat,host_quat,_nall*4,true); + _quat_avail=true; + } } - /// Copy data other than pos and data to device - inline void add_other_data() { - time_other.start(); - if (_charge) - add_q_data(); - if (_rot) - add_quat_data(); - time_other.stop(); - } - /// Return number of bytes used on device - inline double gpu_bytes() { return _gpu_bytes; } - - // -------------------------COPY FROM GPU ------------------------------- - - /// Copy answers from device into read buffer asynchronously - void copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom); - - /// Copy answers from device into read buffer asynchronously - void copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, int *ilist); - - /// Copy energy and virial data into LAMMPS memory - double energy_virial(double *eatom, double **vatom, double *virial); - - /// Copy energy and virial data into LAMMPS memory - double energy_virial(double *eatom, double **vatom, 
double *virial, - double &ecoul); - - /// Add forces and torques from the GPU into a LAMMPS pointer - void get_answers(double **f, double **tor); + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } // ------------------------------ DATA ---------------------------------- @@ -352,10 +358,6 @@ class PairGPUAtom { UCL_D_Vec dev_q; /// Quaterions UCL_D_Vec dev_quat; - /// Force and possibly torque - UCL_D_Vec dev_ans; - /// Energy and virial per-atom storage - UCL_D_Vec dev_engv; #ifdef GPU_CAST UCL_D_Vec dev_x_cast; @@ -370,10 +372,6 @@ class PairGPUAtom { UCL_H_Vec host_q; /// Buffer for moving quat data to GPU UCL_H_Vec host_quat; - /// Force and possibly torque data on host - UCL_H_Vec host_ans; - /// Energy/virial data on host - UCL_H_Vec host_engv; /// Cell list identifiers for device nbor builds UCL_D_Vec dev_cell_id; @@ -383,7 +381,7 @@ class PairGPUAtom { UCL_D_Vec dev_tag; /// Device timers - UCL_Timer time_pos, time_other, time_answer; + UCL_Timer time_pos, time_q, time_quat; /// Geryon device UCL_Device *dev; @@ -396,19 +394,19 @@ class PairGPUAtom { #endif bool _compiled; - - bool alloc(const int inum, const int nall); - bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields; + // True if data has been copied to device already + bool _x_avail, _q_avail, _quat_avail, _resized; + + bool alloc(const int nall); + + bool _allocated, _rot, _charge, _other; + int _max_atoms, _nall; bool _gpu_nbor, _bonds; - int *_ilist; double _time_cast; - double _gpu_bytes; + double _max_gpu_bytes; - bool _newton; - #ifndef USE_OPENCL CUDPPConfiguration sort_config; CUDPPHandle sort_plan; diff --git a/lib/gpu/pair_gpu_balance.h b/lib/gpu/pair_gpu_balance.h index a3a0f61a62..9e14ad60d8 100644 --- a/lib/gpu/pair_gpu_balance.h +++ b/lib/gpu/pair_gpu_balance.h @@ -23,7 +23,7 @@ #define _HD_BALANCE_EVERY 25 #define _HD_BALANCE_WEIGHT 0.5 -#define 
_HD_BALANCE_GAP 1.05 +#define _HD_BALANCE_GAP 1.10 /// Host/device load balancer template @@ -33,7 +33,8 @@ class PairGPUBalance { inline ~PairGPUBalance() { clear(); } /// Clear any old data and setup for new LAMMPS run - inline void init(PairGPUDevice *gpu, const double split); + inline void init(PairGPUDevice *gpu, const bool gpu_nbor, + const double split); /// Clear all host and device data inline void clear() { @@ -43,23 +44,25 @@ class PairGPUBalance { _init_done=false; } } + + /// Return the timestep since initialization + inline int timestep() { return _timestep; } /// Get a count of the number of particles host will handle for initial alloc - inline int first_host_count(const int nlocal,const bool gpu_nbor, - const double gpu_split) const { + inline int first_host_count(const int nlocal, const double gpu_split, + const bool gpu_nbor) const { int host_nlocal=0; if (gpu_nbor && gpu_split!=1.0) { if (gpu_split>0) host_nlocal=static_cast(ceil((1.0-gpu_split)*nlocal)); else - host_nlocal=static_cast(ceil(0.1*nlocal)); + host_nlocal=static_cast(ceil(0.05*nlocal)); } return host_nlocal; } /// Return the number of particles the device will handle this timestep - inline int get_gpu_count(const int timestep, const int ago, - const int inum_full); + inline int get_gpu_count(const int ago, const int inum_full); /// Return the average fraction of particles handled by device on all procs inline double all_avg_split() { @@ -82,10 +85,10 @@ class PairGPUBalance { if (_measure_this_step) { _device->gpu->sync(); _device->gpu_barrier(); + _device->start_host_timer(); _device_time.start(); _device->gpu->sync(); _device->gpu_barrier(); - _device->start_host_timer(); } } @@ -95,34 +98,34 @@ class PairGPUBalance { /// Calculate the new host/device split based on the cpu and device times /** \note Only does calculation every _HD_BALANCE_EVERY timesteps (and first 10) **/ - inline void balance(const double cpu_time, const bool gpu_nbor); + inline void balance(const double 
cpu_time); /// Calls balance() and then get_gpu_count() - inline int balance(const int timestep, const int ago, const int inum_full, - const double cpu_time, const bool gpu_nbor) { - balance(cpu_time,gpu_nbor); - return get_gpu_count(timestep,ago,inum_full); + inline int balance(const int ago,const int inum_full,const double cpu_time) { + balance(cpu_time); + return get_gpu_count(ago,inum_full); } private: PairGPUDevice *_device; UCL_Timer _device_time; - bool _init_done; + bool _init_done, _gpu_nbor; bool _load_balance; double _actual_split, _avg_split, _desired_split, _max_split; int _avg_count; bool _measure_this_step; - int _inum, _inum_full; + int _inum, _inum_full, _timestep; }; #define PairGPUBalanceT PairGPUBalance template -void PairGPUBalanceT::init(PairGPUDevice *gpu, - const double split) { +void PairGPUBalanceT::init(PairGPUDevice *gpu, + const bool gpu_nbor, const double split) { clear(); + _gpu_nbor=gpu_nbor; _init_done=true; _device=gpu; @@ -130,7 +133,7 @@ void PairGPUBalanceT::init(PairGPUDevice *gpu, if (split<0.0) { _load_balance=true; - _desired_split=0.9; + _desired_split=0.90; } else { _load_balance=false; _desired_split=split; @@ -138,14 +141,14 @@ void PairGPUBalanceT::init(PairGPUDevice *gpu, _actual_split=_desired_split; _avg_split=0.0; _avg_count=0; + _timestep=0; } template -int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago, - const int inum_full) { +int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) { _measure_this_step=false; if (_load_balance) { - if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) { + if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) { _measure_this_step=true; _inum_full=inum_full; } @@ -156,44 +159,44 @@ int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago, } _inum=static_cast(floor(_actual_split*inum_full)); if (_inum==0) _inum++; + _timestep++; return _inum; } template -void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) { +void 
PairGPUBalanceT::balance(const double cpu_time) { if (_measure_this_step) { + _measure_this_step=false; + double gpu_time=_device_time.seconds(); + + double max_gpu_time; + MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX, + _device->gpu_comm()); + if (_inum_full==_inum) { _desired_split=1.0; return; } - _measure_this_step=false; - double gpu_time=_device_time.seconds(); + double cpu_time_per_atom=cpu_time/(_inum_full-_inum); + double cpu_other_time=_device->host_time()-cpu_time; + int host_inum=static_cast((max_gpu_time-cpu_other_time)/ + cpu_time_per_atom); - double cpu_gpu_time[3], max_times[3]; - cpu_gpu_time[0]=cpu_time/(_inum_full-_inum); - cpu_gpu_time[1]=gpu_time/_inum; - cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full; + double split=static_cast(_inum_full-host_inum)/_inum_full; + _desired_split=split*_HD_BALANCE_GAP; + if (_desired_split>1.0) + _desired_split=1.0; + if (_desired_split<0.0) + _desired_split=0.0; - MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX, - _device->gpu_comm()); - double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]); - split*=_HD_BALANCE_GAP; - - if (split>1.0) - split=1.0; - if (_avg_count<10) - _desired_split=(_desired_split*_avg_count+split)/(_avg_count+1); - else - _desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+ - _HD_BALANCE_WEIGHT*split; - - if (!gpu_nbor) { + if (!_gpu_nbor) { if (_desired_split<_max_split) _actual_split=_desired_split; else _actual_split=_max_split; } +//std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl; } _avg_split+=_desired_split; _avg_count++; diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu index bcf41c0050..33742a4cba 100644 --- a/lib/gpu/pair_gpu_build_kernel.cu +++ b/lib/gpu/pair_gpu_build_kernel.cu @@ -18,7 +18,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include 
"nv_kernel_def.h" texture neigh_tex; #ifdef _DOUBLE_DOUBLE @@ -36,6 +36,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #else #define fetch_pos(i,y) x_[i] +#define BLOCK_NBOR_BUILD 64 #endif @@ -54,29 +55,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define numtyp4 float4 #endif -#define CELL_BLOCK_SIZE 64 -#define BLOCK_2D 8 +#define BLOCK_CELL_2D 8 + +#define SBBITS 30 #define SBBITS 30 __kernel void transpose(int *out, int *in, int columns_in, int rows_in) { - __local float block[BLOCK_2D][BLOCK_2D+1]; + __local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; unsigned ti=THREAD_ID_X; unsigned tj=THREAD_ID_Y; unsigned bi=BLOCK_ID_X; unsigned bj=BLOCK_ID_Y; - unsigned i=bi*BLOCK_2D+ti; - unsigned j=bj*BLOCK_2D+tj; + unsigned i=bi*BLOCK_CELL_2D+ti; + unsigned j=bj*BLOCK_CELL_2D+tj; if ((ipid_i) { - diff.x = atom_i.x - pos_sh[j].x; - diff.y = atom_i.y - pos_sh[j].y; - diff.z = atom_i.z - pos_sh[j].z; + diff.x = atom_i.x - pos_sh[j].x; + diff.y = atom_i.y - pos_sh[j].y; + diff.z = atom_i.z - pos_sh[j].z; - r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; - if (r2 < cell_size*cell_size && r2 > 1e-5) { - if (cnt < neigh_bin_size) { - *neigh_list = pid_j; - neigh_list+=stride; - } - cnt++; - } - } + r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; + if (r2 < cell_size*cell_size && r2 > 1e-5) { + if (cnt < neigh_bin_size) { + *neigh_list = pid_j; + neigh_list+=stride; + } + cnt++; + } } } __syncthreads(); @@ -249,9 +250,10 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos, } __kernel void kernel_special(__global int *dev_nbor, - __global int *host_nbor_list, __global int *tag, + __global int *host_nbor_list, + __global int *host_numj, __global int *tag, __global int *nspecial, __global int *special, - int inum, int nt, int nall) { + int inum, int nt, int nall, int max_nbors) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; @@ -263,15 +265,17 @@ __kernel void kernel_special(__global int *dev_nbor, int 
n2=nspecial[ii*3+1]; int n3=nspecial[ii*3+2]; + int numj; if (ii < inum) { stride=inum; list=dev_nbor+stride+ii; + numj=*list; + list+=stride; } else { - stride=nt-inum; - list=host_nbor_list+ii-inum; + stride=1; + list=host_nbor_list+(ii-inum)*max_nbors; + numj=host_numj[ii-inum]; } - int numj=*list; - list+=stride; list_end=list+numj*stride; for ( ; list #include +#ifdef _OPENMP +#include +#endif + +#ifdef USE_OPENCL +#include "pair_gpu_dev_cl.h" +#else +#include "pair_gpu_dev_ptx.h" +#endif #define PairGPUDeviceT PairGPUDevice template PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false), _gpu_mode(GPU_FORCE), _first_device(0), - _last_device(0) { + _last_device(0), _compiled(false) { } template @@ -34,14 +43,19 @@ PairGPUDeviceT::~PairGPUDevice() { } template -bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double p_split, - const int nthreads) { +int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double p_split, + const int nthreads, const int t_per_atom) { _nthreads=nthreads; + #ifdef _OPENMP + omp_set_num_threads(nthreads); + #endif + _threads_per_atom=t_per_atom; + _threads_per_charge=t_per_atom; if (_device_init) - return true; + return 0; _device_init=true; _comm_world=world; _comm_replica=replica; @@ -96,7 +110,12 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ (last_gpu-first_gpu+1))); - int my_gpu=node_rank/_procs_per_gpu; + int my_gpu=node_rank/_procs_per_gpu+first_gpu; + + // Time on the device only if 1 proc per gpu + _time_device=true; + if (_procs_per_gpu>1) + _time_device=false; // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); @@ -104,39 +123,109 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, 
gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) - return false; + return -2; gpu->set(my_gpu); - return true; + + _long_range_precompute=0; + + int flag=compile_kernels(); + + return flag; } template -bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, - const int host_nlocal, const int nall, - const int maxspecial, const bool gpu_nbor, - const int gpu_host, const int max_nbors, - const double cell_size, const bool pre_cut) { +int PairGPUDeviceT::init(PairGPUAns &ans, const bool charge, + const bool rot, const int nlocal, + const int host_nlocal, const int nall, + PairGPUNbor *nbor, const int maxspecial, + const int gpu_host, const int max_nbors, + const double cell_size, const bool pre_cut) { if (!_device_init) - return false; + return -1; + if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) + return -5; + + // Counts of data transfers for timing overhead estimates + _data_in_estimate=0; + _data_out_estimate=1; + + // Initial number of local particles + int ef_nlocal=nlocal; + if (_particle_split<1.0 && _particle_split>0.0) + ef_nlocal=static_cast(_particle_split*nlocal); + + bool gpu_nbor=false; + if (_gpu_mode==GPU_NEIGH) + gpu_nbor=true; + if (_init_count==0) { // Initialize atom and nbor data - int ef_nlocal=nlocal; - if (_particle_split<1.0 && _particle_split>0.0) - ef_nlocal=static_cast(_particle_split*nlocal); - if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor, - gpu_nbor && maxspecial>0)) - return false; - if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, - gpu_host,pre_cut)) - return false; - nbor.cell_size(cell_size); + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0)) + return -3; + + _data_in_estimate++; + if (charge) + _data_in_estimate++; + if (rot) + _data_in_estimate++; } else { - if (cell_size>nbor.cell_size()) - nbor.cell_size(cell_size); + if (atom.charge()==false && charge) + _data_in_estimate++; + if (atom.quat()==false && rot) + 
_data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial)) + return -3; } + + if (!ans.init(ef_nlocal,charge,rot,*gpu)) + return -3; + + if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, + *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, + _block_cell_id, _block_nbor_build)) + return -3; + nbor->cell_size(cell_size); _init_count++; - return true; + return 0; +} + +template +int PairGPUDeviceT::init(PairGPUAns &ans, const int nlocal, + const int nall) { + if (!_device_init) + return -1; + if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) + return -5; + + if (_init_count==0) { + // Initialize atom and nbor data + if (!atom.init(nall,true,false,*gpu,false,false)) + return -3; + } else + if (!atom.add_fields(true,false,false,false)) + return -3; + + if (!ans.init(nlocal,true,false,*gpu)) + return -3; + + _init_count++; + return 0; +} + +template +void PairGPUDeviceT::set_single_precompute + (PPPMGPUMemory *pppm) { + _long_range_precompute=1; + pppm_single=pppm; +} + +template +void PairGPUDeviceT::set_double_precompute + (PPPMGPUMemory *pppm) { + _long_range_precompute=2; + pppm_double=pppm; } template @@ -152,11 +241,17 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); - fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu); + fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); + #ifdef _OPENMP + fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); + #endif fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); - for (int i=first_gpu; i<=last_gpu; i++) { + int last=last_gpu+1; + if (last>gpu->num_devices()) + last=gpu->num_devices(); + for (int i=first_gpu; iname(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ 
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+ " GHZ ("; @@ -177,32 +272,152 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, } template -void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, - const double max_bytes, FILE *screen) { - double single[5], times[5]; +void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls, + double &gpu_overhead, + double &gpu_driver_overhead) { + UCL_H_Vec *host_data_in=NULL, *host_data_out=NULL; + UCL_D_Vec *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL; + UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL; + UCL_Timer over_timer(*gpu); - single[0]=atom.transfer_time(); + if (_data_in_estimate>0) { + host_data_in=new UCL_H_Vec[_data_in_estimate]; + dev_data_in=new UCL_D_Vec[_data_in_estimate]; + timers_in=new UCL_Timer[_data_in_estimate]; + } + + if (_data_out_estimate>0) { + host_data_out=new UCL_H_Vec[_data_out_estimate]; + dev_data_out=new UCL_D_Vec[_data_out_estimate]; + timers_out=new UCL_Timer[_data_out_estimate]; + } + + if (kernel_calls>0) { + kernel_data=new UCL_D_Vec[kernel_calls]; + timers_kernel=new UCL_Timer[kernel_calls]; + } + + for (int i=0; i<_data_in_estimate; i++) { + host_data_in[i].alloc(1,*gpu); + dev_data_in[i].alloc(1,*gpu); + timers_in[i].init(*gpu); + } + + for (int i=0; i<_data_out_estimate; i++) { + host_data_out[i].alloc(1,*gpu); + dev_data_out[i].alloc(1,*gpu); + timers_out[i].init(*gpu); + } + + for (int i=0; isync(); + gpu_barrier(); + over_timer.start(); + gpu->sync(); + gpu_barrier(); + + double driver_time=MPI_Wtime(); + for (int i=0; i<_data_in_estimate; i++) { + timers_in[i].start(); + ucl_copy(dev_data_in[i],host_data_in[i],true); + timers_in[i].stop(); + } + + for (int i=0; i0) { + delete [] host_data_in; + delete [] dev_data_in; + delete [] timers_in; + } + + if (_data_out_estimate>0) { + delete [] host_data_out; + delete [] dev_data_out; + delete [] timers_out; + } + + if (kernel_calls>0) { + delete [] 
kernel_data; + delete [] timers_kernel; + } +} + +template +void PairGPUDeviceT::output_times(UCL_Timer &time_pair, + PairGPUAns &ans, + PairGPUNbor &nbor, const double avg_split, + const double max_bytes, + const double gpu_overhead, + const double driver_overhead, + const int threads_per_atom, FILE *screen) { + double single[8], times[8]; + + single[0]=atom.transfer_time()+ans.transfer_time(); single[1]=nbor.time_nbor.total_seconds(); single[2]=nbor.time_kernel.total_seconds(); single[3]=time_pair.total_seconds(); - single[4]=atom.cast_time(); + single[4]=atom.cast_time()+ans.cast_time(); + single[5]=gpu_overhead; + single[6]=driver_overhead; + single[7]=ans.cpu_idle_time(); - MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); - double my_max_bytes=max_bytes; + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); if (replica_me()==0) - if (screen && times[3]>0.0) { + if (screen && times[5]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," GPU Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (procs_per_gpu()==1) { + if (time_device()) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); @@ -212,7 +427,71 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } + fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: 
%.4f.\n",avg_split); + fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); + + fprintf(screen,"-------------------------------------"); + fprintf(screen,"--------------------------------\n\n"); + } +} + +template +void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, + UCL_Timer &time_out, + UCL_Timer &time_map, + UCL_Timer &time_rho, + UCL_Timer &time_interp, + PairGPUAns &ans, + const double max_bytes, + const double cpu_time, + const double idle_time, FILE *screen) { + double single[8], times[8]; + + single[0]=time_out.total_seconds(); + single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); + single[2]=time_map.total_seconds(); + single[3]=time_rho.total_seconds(); + single[4]=time_interp.total_seconds(); + single[5]=ans.transfer_time()+ans.cast_time(); + single[6]=cpu_time; + single[7]=idle_time; + + MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); + double mpi_max_bytes; + MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); + double max_mb=mpi_max_bytes/(1024.0*1024.0); + + if (replica_me()==0) + if (screen && times[6]>0.0) { + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (time_device()) { + fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); + fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); + fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); + fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size); + fprintf(screen,"Force interp: %.4f 
s.\n",times[4]/_replica_size); + fprintf(screen,"Total rho: %.4f s.\n", + (times[0]+times[2]+times[3])/_replica_size); + fprintf(screen,"Total interp: %.4f s.\n", + (times[1]+times[4])/_replica_size); + fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); + fprintf(screen,"Total: %.4f s.\n", + (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ + _replica_size); + } + fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"-------------------------------------"); @@ -223,10 +502,17 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, template void PairGPUDeviceT::clear() { if (_init_count>0) { + _long_range_precompute=0; _init_count--; if (_init_count==0) { atom.clear(); - nbor.clear(); + _nbor_shared.clear(); + if (_compiled) { + k_zero.clear(); + k_info.clear(); + delete dev_program; + _compiled=false; + } } } } @@ -241,21 +527,80 @@ void PairGPUDeviceT::clear_device() { } } +template +int PairGPUDeviceT::compile_kernels() { + int flag=0; + + if (_compiled) + return flag; + + std::string flags="-cl-mad-enable"; + dev_program=new UCL_Program(*gpu); + int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str()); + if (success!=UCL_SUCCESS) + return -4; + k_zero.set_function(*dev_program,"kernel_zero"); + k_info.set_function(*dev_program,"kernel_info"); + _compiled=true; + + UCL_H_Vec h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED); + UCL_D_Vec d_gpu_lib_data(14,*gpu); + k_info.set_size(1,1); + k_info.run(&d_gpu_lib_data.begin()); + ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false); + + #ifndef USE_OPENCL + if (static_cast(h_gpu_lib_data[0])/100.0>gpu->arch()) + return -4; + #endif + + _num_mem_threads=h_gpu_lib_data[1]; + _warp_size=h_gpu_lib_data[2]; + if (_threads_per_atom<1) + _threads_per_atom=h_gpu_lib_data[3]; + if (_threads_per_charge<1) + 
_threads_per_charge=h_gpu_lib_data[13]; + _pppm_max_spline=h_gpu_lib_data[4]; + _pppm_block=h_gpu_lib_data[5]; + _block_pair=h_gpu_lib_data[6]; + _max_shared_types=h_gpu_lib_data[7]; + _block_cell_2d=h_gpu_lib_data[8]; + _block_cell_id=h_gpu_lib_data[9]; + _block_nbor_build=h_gpu_lib_data[10]; + _block_bio_pair=h_gpu_lib_data[11]; + _max_bio_shared_types=h_gpu_lib_data[12]; + + if (static_cast(_block_pair)>gpu->group_size()) + _block_pair=gpu->group_size(); + if (static_cast(_block_bio_pair)>gpu->group_size()) + _block_bio_pair=gpu->group_size(); + if (_threads_per_atom>_warp_size) + _threads_per_atom=_warp_size; + if (_warp_size%_threads_per_atom!=0) + _threads_per_atom=1; + if (_threads_per_charge>_warp_size) + _threads_per_charge=_warp_size; + if (_warp_size%_threads_per_charge!=0) + _threads_per_charge=1; + + return flag; +} + template double PairGPUDeviceT::host_memory_usage() const { - return atom.host_memory_usage()+ - nbor.host_memory_usage()+4*sizeof(numtyp)+ + return atom.host_memory_usage()+4*sizeof(numtyp)+ sizeof(PairGPUDevice); } template class PairGPUDevice; PairGPUDevice pair_gpu_device; -bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double particle_split, const int nthreads) { +int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads, + const int t_per_atom) { return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, - particle_split,nthreads); + particle_split,nthreads,t_per_atom); } void lmp_clear_device() { @@ -264,14 +609,5 @@ void lmp_clear_device() { double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { - if (pair_gpu_device.init_count()) { - pair_gpu_device.stop_host_timer(); - pair_gpu_device.gpu->sync(); - double 
evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul); - pair_gpu_device.atom.get_answers(f,tor); - - return evdw; - } - return 0.0; + return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); } - diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index 33aa54959b..1e7e15e6a8 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -19,11 +19,17 @@ #define PAIR_GPU_DEVICE_H #include "pair_gpu_atom.h" +#include "pair_gpu_ans.h" #include "pair_gpu_nbor.h" +#include "pppm_gpu_memory.h" #include "mpi.h" #include #include "stdio.h" #include +#include + +template class PPPMGPUMemory; template class PairGPUDevice { @@ -33,10 +39,15 @@ class PairGPUDevice { /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes - * the device (>=first_gpu and <=last_gpu) that this proc will be using **/ - bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + * the device (>=first_gpu and <=last_gpu) that this proc will be using + * Returns: + * - 0 if successfull + * - -2 if GPU not found + * - -4 if GPU library not compiled for GPU **/ + int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, - const double particle_split, const int nthreads); + const double particle_split, const int nthreads, + const int t_per_atom); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored @@ -50,19 +61,67 @@ class PairGPUDevice { * \param max_nbors Initial number of rows in the neighbor matrix * \param cell_size cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel **/ - bool init(const bool charge, const bool rot, const int nlocal, - const int host_nlocal, const int nall, const int maxspecial, - const bool gpu_nbor, const int gpu_host, const int max_nbors, - const double cell_size, const bool 
pre_cut); + * than the force kernel + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(PairGPUAns &a, const bool charge, const bool rot, + const int nlocal, const int host_nlocal, const int nall, + PairGPUNbor *nbor, const int maxspecial, const int gpu_host, + const int max_nbors, const double cell_size, const bool pre_cut); + + /// Initialize the device for Atom storage only + /** \param nlocal Total number of local particles to allocate memory for + * \param nall Total number of local+ghost particles + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(PairGPUAns &ans, const int nlocal, const int nall); /// Output a message for pair_style acceleration with device stats void init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu); + /// Perform charge assignment asynchronously for PPPM + void set_single_precompute(PPPMGPUMemory *pppm); + + /// Perform charge assignment asynchronously for PPPM + void set_double_precompute(PPPMGPUMemory *pppm); + + /// Esimate the overhead from GPU calls from multiple procs + /** \param kernel_calls Number of kernel calls/timestep for timing estimated + * overhead + * \param gpu_overhead Estimated gpu overhead per timestep (sec) + * \param driver_overhead Estimated overhead from driver per timestep (s) **/ + void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, + double &gpu_driver_overhead); + + /// Returns true if double precision is supported on card + inline bool double_precision() { return gpu->double_precision(); } + /// Output a message with timing information - void output_times(UCL_Timer &time_pair, const double avg_split, - 
const double max_bytes, FILE *screen); + void output_times(UCL_Timer &time_pair, PairGPUAns &ans, + PairGPUNbor &nbor, const double avg_split, + const double max_bytes, const double gpu_overhead, + const double driver_overhead, + const int threads_per_atom, FILE *screen); + + /// Output a message with timing information + void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, + UCL_Timer & time_map, UCL_Timer & time_rho, + UCL_Timer &time_interp, + PairGPUAns &ans, + const double max_bytes, const double cpu_time, + const double cpu_idle_time, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data void clear(); @@ -70,11 +129,37 @@ class PairGPUDevice { /// Clear all memory on host and device void clear_device(); + /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS + inline void add_ans_object(PairGPUAns *ans) + { ans_queue.push(ans); } + + /// Add "answers" (force,energies,etc.) into LAMMPS structures + inline double fix_gpu(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul) { + atom.data_unavail(); + if (ans_queue.empty()==false) { + stop_host_timer(); + double evdw=0.0; + while (ans_queue.empty()==false) { + evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); + ans_queue.pop(); + } + return evdw; + } + return 0.0; + } + /// Start timer on host - inline void start_host_timer() { _cpu_full=MPI_Wtime(); } + inline void start_host_timer() + { _cpu_full=MPI_Wtime(); _host_timer_started=true; } /// Stop timer on host - inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; } + inline void stop_host_timer() { + if (_host_timer_started) { + _cpu_full=MPI_Wtime()-_cpu_full; + _host_timer_started=false; + } + } /// Return host time inline double host_time() { return _cpu_full; } @@ -114,6 +199,42 @@ class PairGPUDevice { inline double particle_split() const { return _particle_split; } /// Return the initialization count for the device 
inline int init_count() const { return _init_count; } + /// True if device is being timed + inline bool time_device() const { return _time_device; } + + /// Return the number of threads accessing memory simulatenously + inline int num_mem_threads() const { return _num_mem_threads; } + /// Return the number of threads per atom for pair styles + inline int threads_per_atom() const { return _threads_per_atom; } + /// Return the number of threads per atom for pair styles using charge + inline int threads_per_charge() const { return _threads_per_charge; } + /// Return the min of the pair block size or the device max block size + inline int pair_block_size() const { return _block_pair; } + /// Return the maximum number of atom types that can be used with shared mem + inline int max_shared_types() const { return _max_shared_types; } + /// Return the maximum order for PPPM splines + inline int pppm_max_spline() const { return _pppm_max_spline; } + /// Return the block size for PPPM kernels + inline int pppm_block() const { return _pppm_block; } + /// Return the block size for neighbor binning + inline int block_cell_2d() const { return _block_cell_2d; } + /// Return the block size for atom mapping for neighbor builds + inline int block_cell_id() const { return _block_cell_id; } + /// Return the block size for neighbor build kernel + inline int block_nbor_build() const { return _block_nbor_build; } + /// Return the block size for "bio" pair styles + inline int block_bio_pair() const { return _block_bio_pair; } + /// Return the maximum number of atom types for shared mem with "bio" styles + inline int max_bio_shared_types() const { return _max_bio_shared_types; } + + // -------------------- SHARED DEVICE ROUTINES -------------------- + // Perform asynchronous zero of integer array + void zero(UCL_D_Vec &mem, const int numel) { + int num_blocks=static_cast(ceil(static_cast(numel)/ + _block_pair)); + k_zero.set_size(num_blocks,_block_pair); + k_zero.run(&mem.begin(),&numel); + 
} // -------------------------- DEVICE DATA ------------------------- @@ -130,11 +251,30 @@ class PairGPUDevice { // --------------------------- NBOR DATA ---------------------------- /// Neighbor Data - PairGPUNbor nbor; + PairGPUNborShared _nbor_shared; + + // ------------------------ LONG RANGE DATA ------------------------- + + // Long Range Data + int _long_range_precompute; + PPPMGPUMemory *pppm_single; + PPPMGPUMemory *pppm_double; + /// Precomputations for long range charge assignment (asynchronously) + inline void precompute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *charge, double *boxlo, double *prd) { + if (_long_range_precompute==1) + pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge, + boxlo,prd); + else if (_long_range_precompute==2) + pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, + boxlo,prd); + } private: + std::queue *> ans_queue; int _init_count; - bool _device_init; + bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; @@ -142,6 +282,19 @@ class PairGPUDevice { double _particle_split; double _cpu_full; + int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge; + int _pppm_max_spline, _pppm_block; + int _block_pair, _max_shared_types; + int _block_cell_2d, _block_cell_id, _block_nbor_build; + int _block_bio_pair, _max_bio_shared_types; + + UCL_Program *dev_program; + UCL_Kernel k_zero, k_info; + bool _compiled; + int compile_kernels(); + + int _data_in_estimate, _data_out_estimate; + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp index 123fbe54aa..df138a7eff 100644 --- a/lib/gpu/pair_gpu_nbor.cpp +++ b/lib/gpu/pair_gpu_nbor.cpp @@ -18,15 +18,9 @@ #include "pair_gpu_precision.h" #include 
"pair_gpu_nbor.h" +#include "pair_gpu_device.h" #include "math.h" -#ifdef USE_OPENCL -#include "pair_gpu_nbor_cl.h" -#else -#include "pair_gpu_nbor_ptx.h" -#include "pair_gpu_build_ptx.h" -#endif - int PairGPUNbor::bytes_per_atom(const int max_nbors) const { if (_gpu_nbor) return (max_nbors+2)*sizeof(int); @@ -36,12 +30,18 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const { return (max_nbors+3)*sizeof(int); } -bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, +bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum, + const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &devi, const bool gpu_nbor, const int gpu_host, - const bool pre_cut) { + const bool pre_cut, const int block_cell_2d, + const int block_cell_id, const int block_nbor_build) { clear(); + _block_cell_2d=block_cell_2d; + _block_cell_id=block_cell_id; + _block_nbor_build=block_nbor_build; + _shared=shared; dev=&devi; _gpu_nbor=gpu_nbor; if (gpu_host==0) @@ -80,8 +80,11 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, success=success && (host_packed.alloc(2*IJ_SIZE,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); alloc(success); + if (!success) + return false; + if (_use_packing==false) - compile_kernels(devi); + _shared->compile_kernels(devi,gpu_nbor); return success; } @@ -89,13 +92,14 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, void PairGPUNbor::alloc(bool &success) { dev_nbor.clear(); host_acc.clear(); + int nt=_max_atoms+_max_host; if (_use_packing==false || _gpu_nbor) success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); else success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev, + success=success && (host_acc.alloc(nt*2,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); _c_bytes=dev_nbor.row_bytes(); @@ -108,11 +112,31 @@ void 
PairGPUNbor::alloc(bool &success) { if (_max_host>0) { host_nbor.clear(); dev_host_nbor.clear(); - success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev, + dev_host_numj.clear(); + host_ilist.clear(); + host_jlist.clear(); + + success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev, UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host, + success=success && (dev_host_nbor.alloc(_max_nbors*_max_host, *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); - _c_bytes+=dev_host_nbor.row_bytes(); + success=success && (dev_host_numj.alloc(_max_host,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); + if (!success) + return; + for (int i=0; i0) { dev_nspecial.clear(); @@ -145,6 +169,9 @@ void PairGPUNbor::clear() { dev_host_nbor.clear(); dev_packed.clear(); host_nbor.clear(); + dev_host_numj.clear(); + host_ilist.clear(); + host_jlist.clear(); dev_nspecial.clear(); dev_special.clear(); dev_special_t.clear(); @@ -152,27 +179,13 @@ void PairGPUNbor::clear() { time_kernel.clear(); time_nbor.clear(); } - - if (_compiled) { - if (_gpu_nbor) { - k_cell_id.clear(); - k_cell_counts.clear(); - k_build_nbor.clear(); - k_transpose.clear(); - k_special.clear(); - delete build_program; - } else { - k_nbor.clear(); - delete nbor_program; - } - _compiled=false; - } } double PairGPUNbor::host_memory_usage() const { if (_gpu_nbor) { if (_gpu_host) - return host_nbor.row_bytes()*host_nbor.rows(); + return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+ + host_jlist.row_bytes(); else return 0; } else @@ -186,7 +199,7 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj, UCL_H_Vec ilist_view; ilist_view.view(ilist,inum,*dev); - ucl_copy(dev_nbor,ilist_view,true); + ucl_copy(dev_nbor,ilist_view,false); UCL_D_Vec nbor_offset; UCL_H_Vec host_offset; @@ -238,46 +251,20 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj, if 
(_use_packing==false) { time_kernel.start(); int GX=static_cast(ceil(static_cast(inum)/block_size)); - k_nbor.set_size(GX,block_size); - k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum); + _shared->k_nbor.set_size(GX,block_size); + _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum); time_kernel.stop(); } } -void PairGPUNbor::compile_kernels(UCL_Device &dev) { - std::string flags="-cl-fast-relaxed-math -cl-mad-enable"; - - if (_gpu_nbor==false) { - nbor_program=new UCL_Program(dev); - nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str()); - k_nbor.set_function(*nbor_program,"kernel_unpack"); - } else { - build_program=new UCL_Program(dev); - #ifdef USE_OPENCL - std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n"; - exit(1); - #else - build_program->load_string(pair_gpu_build_kernel,flags.c_str()); - #endif - k_cell_id.set_function(*build_program,"calc_cell_id"); - k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts"); - k_build_nbor.set_function(*build_program,"calc_neigh_list_cell"); - k_transpose.set_function(*build_program,"transpose"); - k_special.set_function(*build_program,"kernel_special"); - neigh_tex.get_texture(*build_program,"neigh_tex"); - } - _compiled=true; -} - template void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, const int nall, PairGPUAtom &atom, - double *boxlo, double *boxhi, int *tag, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success, int &mn) { const int nt=inum+host_inum; - if (_maxspecial>0) { time_nbor.start(); UCL_H_Vec view_nspecial, view_special, view_tag; @@ -290,25 +277,25 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, time_nbor.stop(); time_nbor.add_to_total(); time_kernel.start(); - const int b2x=8; - const int b2y=8; + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int 
g2y=static_cast(ceil(static_cast(nt)/b2y)); - k_transpose.set_size(g2x,g2y,b2x,b2y); - k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial, - &nt); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), + &_maxspecial,&nt); } else time_kernel.start(); _nbor_pitch=inum; - neigh_tex.bind_float(atom.dev_x,4); + _shared->neigh_tex.bind_float(atom.dev_x,4); int ncellx, ncelly, ncellz, ncell_3d; - ncellx = static_cast(ceil(((boxhi[0] - boxlo[0]) + + ncellx = static_cast(ceil(((subhi[0] - sublo[0]) + 2.0*_cell_size)/_cell_size)); - ncelly = static_cast(ceil(((boxhi[1] - boxlo[1]) + + ncelly = static_cast(ceil(((subhi[1] - sublo[1]) + 2.0*_cell_size)/_cell_size)); - ncellz = static_cast(ceil(((boxhi[2] - boxlo[2]) + + ncellz = static_cast(ceil(((subhi[2] - sublo[2]) + 2.0*_cell_size)/_cell_size)); ncell_3d = ncellx * ncelly * ncellz; UCL_D_Vec cell_counts; @@ -316,35 +303,36 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, _cell_bytes=cell_counts.row_bytes(); /* build cell list on GPU */ - const int neigh_block=128; + const int neigh_block=_block_cell_id; const int GX=(int)ceil((float)nall/neigh_block); - const numtyp boxlo0=static_cast(boxlo[0]); - const numtyp boxlo1=static_cast(boxlo[1]); - const numtyp boxlo2=static_cast(boxlo[2]); - const numtyp boxhi0=static_cast(boxhi[0]); - const numtyp boxhi1=static_cast(boxhi[1]); - const numtyp boxhi2=static_cast(boxhi[2]); + const numtyp sublo0=static_cast(sublo[0]); + const numtyp sublo1=static_cast(sublo[1]); + const numtyp sublo2=static_cast(sublo[2]); + const numtyp subhi0=static_cast(subhi[0]); + const numtyp subhi1=static_cast(subhi[1]); + const numtyp subhi2=static_cast(subhi[2]); const numtyp cell_size_cast=static_cast(_cell_size); - k_cell_id.set_size(GX,neigh_block); - k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), - &atom.dev_particle_id.begin(), - &boxlo0, &boxlo1, &boxlo2, &boxhi0, 
&boxhi1, - &boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall); + _shared->k_cell_id.set_size(GX,neigh_block); + _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), + &atom.dev_particle_id.begin(), + &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, + &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall); atom.sort_neighbor(nall); /* calculate cell count */ - k_cell_counts.set_size(GX,neigh_block); - k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall, - &ncell_3d); + _shared->k_cell_counts.set_size(GX,neigh_block); + _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), + &nall, &ncell_3d); /* build the neighbor list */ - const int cell_block=64; - k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); - k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), - &cell_counts.begin(), &dev_nbor.begin(), - &dev_host_nbor.begin(), &_max_nbors, &cell_size_cast, - &ncellx, &ncelly, &ncellz, &inum, &nt, &nall); + const int cell_block=_block_nbor_build; + _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); + _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), + &cell_counts.begin(), &dev_nbor.begin(), + &dev_host_nbor.begin(), &dev_host_numj.begin(), + &_max_nbors,&cell_size_cast, + &ncellx, &ncelly, &ncellz, &inum, &nt, &nall); /* Get the maximum number of nbors and realloc if necessary */ UCL_D_Vec numj; @@ -353,7 +341,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, if (nt>inum) { UCL_H_Vec host_offset; host_offset.view_offset(inum,host_acc,nt-inum); - ucl_copy(host_offset,dev_host_nbor,nt-inum,false); + ucl_copy(host_offset,dev_host_numj,nt-inum,false); } mn=host_acc[0]; for (int i=1; i0) { host_nbor.clear(); dev_host_nbor.clear(); - success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor, + success=success && (host_nbor.alloc(mn*_max_host,dev_nbor, UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && 
(dev_host_nbor.alloc((mn+1)*_max_host, + success=success && (dev_host_nbor.alloc(mn*_max_host, dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS); + int *ptr=host_nbor.begin(); + for (int i=0; i<_max_host; i++) { + host_jlist[i]=ptr; + ptr+=mn; + } _gpu_bytes+=dev_host_nbor.row_bytes(); } if (_alloc_packed) { @@ -385,28 +378,29 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, _max_nbors=mn; time_kernel.stop(); time_kernel.add_to_total(); - build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial, + build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial, special, success, mn); return; } if (_maxspecial>0) { const int GX2=static_cast(ceil(static_cast(nt)/cell_block)); - k_special.set_size(GX2,cell_block); - k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), - &atom.dev_tag.begin(), &dev_nspecial.begin(), - &dev_special.begin(), &inum, &nt, &nall); + _shared->k_special.set_size(GX2,cell_block); + _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), + &dev_host_numj.begin(), &atom.dev_tag.begin(), + &dev_nspecial.begin(), &dev_special.begin(), + &inum, &nt, &nall, &_max_nbors); } time_kernel.stop(); time_nbor.start(); if (_gpu_host) - ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false); + ucl_copy(host_nbor,dev_host_nbor,false); time_nbor.stop(); } template void PairGPUNbor::build_nbor_list - (const int inum, const int host_inum, const int nall, - PairGPUAtom &atom, double *boxlo, double *boxhi, + (const int inum, const int host_inum, const int nall, + PairGPUAtom &atom, double *sublo, double *subhi, int *, int **, int **, bool &success, int &mn); diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h index 403bd7aed4..02ad4b201b 100644 --- a/lib/gpu/pair_gpu_nbor.h +++ b/lib/gpu/pair_gpu_nbor.h @@ -19,32 +19,27 @@ #define PAIR_GPU_NBOR_H #include "pair_gpu_atom.h" +#include "pair_gpu_nbor_shared.h" #define IJ_SIZE 131072 #ifdef USE_OPENCL -#include "geryon/ocl_device.h" #include 
"geryon/ocl_timer.h" #include "geryon/ocl_mat.h" -#include "geryon/ocl_kernel.h" -#include "geryon/ocl_texture.h" using namespace ucl_opencl; #else -#include "geryon/nvd_device.h" #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" -#include "geryon/nvd_kernel.h" -#include "geryon/nvd_texture.h" using namespace ucl_cudadr; #endif class PairGPUNbor { public: - PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {} + PairGPUNbor() : _allocated(false), _use_packing(false) {} ~PairGPUNbor() { clear(); } /// Determine whether neighbor unpacking should be used @@ -62,9 +57,11 @@ class PairGPUNbor { * 2 if gpu_nbor is true, and host needs a full nbor list * \param pre_cut True if cutoff test will be performed in separate kernel * than the force kernel **/ - bool init(const int inum, const int host_inum, const int max_nbors, - const int maxspecial, UCL_Device &dev, const bool gpu_nbor, - const int gpu_host, const bool pre_cut); + bool init(PairGPUNborShared *shared, const int inum, const int host_inum, + const int max_nbors, const int maxspecial, UCL_Device &dev, + const bool gpu_nbor, const int gpu_host, const bool pre_cut, + const int block_cell_2d, const int block_cell_id, + const int block_nbor_build); /// Set the size of the cutoff+skin inline void cell_size(const double size) { _cell_size=size; } @@ -131,18 +128,18 @@ class PairGPUNbor { inline int max_nbors() const { return _max_nbors; } /// Loop through neighbor count array and return maximum nbors for a particle - inline int max_nbor_loop(const int inum, int *numj) const { + inline int max_nbor_loop(const int inum, int *numj, int *ilist) const { int mn=0; for (int i=0; i void build_nbor_list(const int inum, const int host_inum, const int nall, - PairGPUAtom &atom, double *boxlo, - double *boxhi, int *tag, int **nspecial, int **special, + PairGPUAtom &atom, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, bool &success, int &max_nbors); /// Return the number of 
bytes used on device @@ -176,31 +173,31 @@ class PairGPUNbor { UCL_H_Vec host_nbor; /// Device storage for neighbor list matrix that will be copied to host /** - 1st row is numj - * - Remaining rows are nbors **/ + * - Remaining rows are by atom, columns are nbors **/ UCL_D_Vec dev_host_nbor; + UCL_D_Vec dev_host_numj; + UCL_H_Vec host_ilist; + UCL_H_Vec host_jlist; /// Device storage for special neighbor counts UCL_D_Vec dev_nspecial; /// Device storage for special neighbors UCL_D_Vec dev_special, dev_special_t; - /// Texture for cached position/type access with CUDA - UCL_Texture neigh_tex; /// Device timers UCL_Timer time_nbor, time_kernel; private: + PairGPUNborShared *_shared; UCL_Device *dev; - UCL_Program *nbor_program, *build_program; - UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor; - UCL_Kernel k_transpose, k_special; - bool _allocated, _use_packing, _compiled; - void compile_kernels(UCL_Device &dev); + bool _allocated, _use_packing; int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial; bool _gpu_nbor, _gpu_host, _alloc_packed; double _cell_size; double _gpu_bytes, _c_bytes, _cell_bytes; void alloc(bool &success); + + int _block_cell_2d, _block_cell_id, _block_nbor_build; }; #endif diff --git a/lib/gpu/pair_gpu_precision.h b/lib/gpu/pair_gpu_precision.h index a5f57c1f95..902975be0b 100644 --- a/lib/gpu/pair_gpu_precision.h +++ b/lib/gpu/pair_gpu_precision.h @@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define acctyp4 _lgpu_float4 #endif -#define MAX_SHARED_TYPES 8 -#define MAX_BIO_SHARED_TYPES 128 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #endif From 6321bca76958bf1ef635f60f40763942b15aaf6f Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:02:58 +0000 Subject: [PATCH 18/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6054 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/MAKE/Makefile.altix | 2 +- src/MAKE/Makefile.bgl | 2 +- 
src/MAKE/Makefile.cygwin | 2 +- src/MAKE/Makefile.encanto | 2 +- src/MAKE/Makefile.fink | 2 +- src/MAKE/Makefile.g++ | 2 +- src/MAKE/Makefile.g++3 | 2 +- src/MAKE/Makefile.glory | 2 +- src/MAKE/Makefile.jaguar | 2 +- src/MAKE/Makefile.lam | 2 +- src/MAKE/Makefile.linux | 2 +- src/MAKE/Makefile.mac | 2 +- src/MAKE/Makefile.mac_mpi | 2 +- src/MAKE/Makefile.mingw | 2 +- src/MAKE/Makefile.mkl | 2 +- src/MAKE/Makefile.odin | 2 +- src/MAKE/Makefile.openmpi | 2 +- src/MAKE/Makefile.pgi | 2 +- src/MAKE/Makefile.power5 | 2 +- src/MAKE/Makefile.qed | 2 +- src/MAKE/Makefile.redsky | 2 +- src/MAKE/Makefile.sdsc | 2 +- src/MAKE/Makefile.seaborg | 2 +- src/MAKE/Makefile.serial | 2 +- src/MAKE/Makefile.serial_debug | 2 +- src/MAKE/Makefile.sgi | 2 +- src/MAKE/Makefile.solaris | 2 +- src/MAKE/Makefile.spirit | 2 +- src/MAKE/Makefile.storm | 2 +- src/MAKE/Makefile.tacc | 2 +- src/MAKE/Makefile.tbird | 2 +- src/MAKE/Makefile.tesla | 2 +- src/MAKE/Makefile.tunnison | 2 +- src/MAKE/Makefile.xt3 | 2 +- 34 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/MAKE/Makefile.altix b/src/MAKE/Makefile.altix index 26c07247cc..982a822aa3 100644 --- a/src/MAKE/Makefile.altix +++ b/src/MAKE/Makefile.altix @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.bgl b/src/MAKE/Makefile.bgl index e8d25d745d..0568d33d53 100644 --- a/src/MAKE/Makefile.bgl +++ b/src/MAKE/Makefile.bgl @@ -63,7 +63,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.cygwin b/src/MAKE/Makefile.cygwin index cb35ccbf34..0d6264848f 100644 --- 
a/src/MAKE/Makefile.cygwin +++ b/src/MAKE/Makefile.cygwin @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.encanto b/src/MAKE/Makefile.encanto index 14a5b7a359..69f8d434ad 100644 --- a/src/MAKE/Makefile.encanto +++ b/src/MAKE/Makefile.encanto @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.fink b/src/MAKE/Makefile.fink index f6a1afa400..0474e0246a 100644 --- a/src/MAKE/Makefile.fink +++ b/src/MAKE/Makefile.fink @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.g++ b/src/MAKE/Makefile.g++ index 2381f7235c..147ac4f388 100755 --- a/src/MAKE/Makefile.g++ +++ b/src/MAKE/Makefile.g++ @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lgfortran reax_SYSLIB = -lgfortran user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.g++3 b/src/MAKE/Makefile.g++3 index 1b35f3c2c0..ddafa913d9 100755 --- a/src/MAKE/Makefile.g++3 +++ b/src/MAKE/Makefile.g++3 @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git 
a/src/MAKE/Makefile.glory b/src/MAKE/Makefile.glory index 9e9f9345bd..00312daf4b 100644 --- a/src/MAKE/Makefile.glory +++ b/src/MAKE/Makefile.glory @@ -75,7 +75,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.jaguar b/src/MAKE/Makefile.jaguar index 50b9934486..07ee4436e4 100644 --- a/src/MAKE/Makefile.jaguar +++ b/src/MAKE/Makefile.jaguar @@ -63,7 +63,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.lam b/src/MAKE/Makefile.lam index f7ad21bc5e..0082b29699 100644 --- a/src/MAKE/Makefile.lam +++ b/src/MAKE/Makefile.lam @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.linux index 15df43de64..a82f1347bb 100755 --- a/src/MAKE/Makefile.linux +++ b/src/MAKE/Makefile.linux @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.mac b/src/MAKE/Makefile.mac index 0a6dfd9bc7..4ac8beaacd 100755 --- a/src/MAKE/Makefile.mac +++ b/src/MAKE/Makefile.mac @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = 
-lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.mac_mpi b/src/MAKE/Makefile.mac_mpi index e28fa06033..c0e2aa4170 100755 --- a/src/MAKE/Makefile.mac_mpi +++ b/src/MAKE/Makefile.mac_mpi @@ -60,7 +60,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lgfortran reax_SYSLIB = -lgfortran user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.mingw b/src/MAKE/Makefile.mingw index d727478859..81132902fb 100644 --- a/src/MAKE/Makefile.mingw +++ b/src/MAKE/Makefile.mingw @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.mkl b/src/MAKE/Makefile.mkl index 40abca8301..4d6cd22545 100644 --- a/src/MAKE/Makefile.mkl +++ b/src/MAKE/Makefile.mkl @@ -63,7 +63,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.odin b/src/MAKE/Makefile.odin index b370ed16c8..cabb2dc4ea 100755 --- a/src/MAKE/Makefile.odin +++ b/src/MAKE/Makefile.odin @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.openmpi b/src/MAKE/Makefile.openmpi index fb86bb4c57..0fcf6fb650 100644 --- a/src/MAKE/Makefile.openmpi +++ b/src/MAKE/Makefile.openmpi @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries 
# SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.pgi b/src/MAKE/Makefile.pgi index 20dcd71ea8..c945e91124 100644 --- a/src/MAKE/Makefile.pgi +++ b/src/MAKE/Makefile.pgi @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.power5 b/src/MAKE/Makefile.power5 index 7ea1b76e2b..616e8f31dc 100644 --- a/src/MAKE/Makefile.power5 +++ b/src/MAKE/Makefile.power5 @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.qed b/src/MAKE/Makefile.qed index 35961016b1..bf008ef744 100644 --- a/src/MAKE/Makefile.qed +++ b/src/MAKE/Makefile.qed @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.redsky b/src/MAKE/Makefile.redsky index 2f9b7a880e..579e527ff7 100644 --- a/src/MAKE/Makefile.redsky +++ b/src/MAKE/Makefile.redsky @@ -86,7 +86,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = ${BLASLIB} diff --git a/src/MAKE/Makefile.sdsc b/src/MAKE/Makefile.sdsc index c7a438d630..21acdaa375 
100644 --- a/src/MAKE/Makefile.sdsc +++ b/src/MAKE/Makefile.sdsc @@ -62,7 +62,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.seaborg b/src/MAKE/Makefile.seaborg index 5134fa34ec..d8398506cb 100644 --- a/src/MAKE/Makefile.seaborg +++ b/src/MAKE/Makefile.seaborg @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.serial b/src/MAKE/Makefile.serial index 7f431b3c7e..64cf13db29 100755 --- a/src/MAKE/Makefile.serial +++ b/src/MAKE/Makefile.serial @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lgfortran reax_SYSLIB = -lgfortran user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.serial_debug b/src/MAKE/Makefile.serial_debug index 771d60b260..9dad10864f 100644 --- a/src/MAKE/Makefile.serial_debug +++ b/src/MAKE/Makefile.serial_debug @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lgfortran reax_SYSLIB = -lgfortran user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.sgi b/src/MAKE/Makefile.sgi index 0b3e03edd1..bb1c8fc254 100644 --- a/src/MAKE/Makefile.sgi +++ b/src/MAKE/Makefile.sgi @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git 
a/src/MAKE/Makefile.solaris b/src/MAKE/Makefile.solaris index 27da3064c6..44ca0148ea 100644 --- a/src/MAKE/Makefile.solaris +++ b/src/MAKE/Makefile.solaris @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.spirit b/src/MAKE/Makefile.spirit index e998af1a72..db5b557fa4 100644 --- a/src/MAKE/Makefile.spirit +++ b/src/MAKE/Makefile.spirit @@ -65,7 +65,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.storm b/src/MAKE/Makefile.storm index 65c838e9ba..8519c35291 100644 --- a/src/MAKE/Makefile.storm +++ b/src/MAKE/Makefile.storm @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.tacc b/src/MAKE/Makefile.tacc index 0ed8f6f34c..045695597b 100644 --- a/src/MAKE/Makefile.tacc +++ b/src/MAKE/Makefile.tacc @@ -60,7 +60,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore reax_SYSLIB = -lifcore user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.tbird b/src/MAKE/Makefile.tbird index 32783ea237..de21c8fdfe 100644 --- a/src/MAKE/Makefile.tbird +++ b/src/MAKE/Makefile.tbird @@ -76,7 +76,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub 
-limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.tesla b/src/MAKE/Makefile.tesla index 9b35a739c8..337fa9e0f5 100755 --- a/src/MAKE/Makefile.tesla +++ b/src/MAKE/Makefile.tesla @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.tunnison b/src/MAKE/Makefile.tunnison index d0b2a2efdf..2afd374aa9 100644 --- a/src/MAKE/Makefile.tunnison +++ b/src/MAKE/Makefile.tunnison @@ -68,7 +68,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -llapack diff --git a/src/MAKE/Makefile.xt3 b/src/MAKE/Makefile.xt3 index a3089716dc..2c610c7bc7 100644 --- a/src/MAKE/Makefile.xt3 +++ b/src/MAKE/Makefile.xt3 @@ -59,7 +59,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack From 96fb599b2d3d5814e74b19f49475d22e5e686f5c Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:03:20 +0000 Subject: [PATCH 19/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6055 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/GPU/Install.sh | 25 +++ src/GPU/fix_gpu.cpp | 48 ++-- src/GPU/fix_gpu.h | 2 + src/GPU/pair_cg_cmm_coul_long_gpu.cpp | 274 +++++------------------ src/GPU/pair_cg_cmm_coul_long_gpu.h | 3 +- src/GPU/pair_cg_cmm_gpu.cpp | 203 ++++------------- src/GPU/pair_cg_cmm_gpu.h | 3 +- src/GPU/pair_gayberne_gpu.cpp | 268 ++++++---------------- src/GPU/pair_gayberne_gpu.h | 5 +- 
src/GPU/pair_lj96_cut_gpu.cpp | 166 ++++---------- src/GPU/pair_lj96_cut_gpu.h | 3 +- src/GPU/pair_lj_charmm_coul_long_gpu.cpp | 264 ++++++---------------- src/GPU/pair_lj_charmm_coul_long_gpu.h | 3 +- src/GPU/pair_lj_cut_coul_cut_gpu.cpp | 192 ++++------------ src/GPU/pair_lj_cut_coul_cut_gpu.h | 3 +- src/GPU/pair_lj_cut_coul_long_gpu.cpp | 225 ++++--------------- src/GPU/pair_lj_cut_coul_long_gpu.h | 3 +- src/GPU/pair_lj_cut_gpu.cpp | 165 ++++---------- src/GPU/pair_lj_cut_gpu.h | 3 +- 19 files changed, 491 insertions(+), 1367 deletions(-) diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 29504865b4..a17dc9ffd5 100644 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -14,6 +14,15 @@ if (test $1 = 1) then sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package fi + if (test -e ../pppm.cpp) then + cp pppm_gpu.cpp .. + cp pppm_gpu_single.cpp .. + cp pppm_gpu_double.cpp .. + cp pppm_gpu.h .. + cp pppm_gpu_single.h .. + cp pppm_gpu_double.h .. + fi + if (test -e ../pair_gayberne.cpp) then cp pair_gayberne_gpu.cpp .. cp pair_gayberne_gpu.h .. @@ -40,14 +49,19 @@ if (test $1 = 1) then fi cp pair_lj_cut_gpu.cpp .. + cp pair_morse_gpu.cpp .. cp pair_lj96_cut_gpu.cpp .. + cp pair_lj_expand_gpu.cpp .. cp pair_lj_cut_coul_cut_gpu.cpp .. cp pair_lj_cut_gpu.h .. + cp pair_morse_gpu.h .. cp pair_lj96_cut_gpu.h .. + cp pair_lj_expand_gpu.h .. cp pair_lj_cut_coul_cut_gpu.h .. cp fix_gpu.cpp .. cp fix_gpu.h .. + cp gpu_extra.h .. 
elif (test $1 = 0) then @@ -56,9 +70,14 @@ elif (test $1 = 0) then sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package fi + rm ../pppm_gpu.cpp + rm ../pppm_gpu_single.cpp + rm ../pppm_gpu_double.cpp rm ../pair_gayberne_gpu.cpp rm ../pair_lj_cut_gpu.cpp + rm ../pair_morse_gpu.cpp rm ../pair_lj96_cut_gpu.cpp + rm ../pair_lj_expand_gpu.cpp rm ../pair_lj_cut_coul_cut_gpu.cpp rm ../pair_lj_cut_coul_long_gpu.cpp rm ../pair_lj_charmm_coul_long_gpu.cpp @@ -66,15 +85,21 @@ elif (test $1 = 0) then rm ../pair_cg_cmm_coul_long_gpu.cpp rm ../fix_gpu.cpp + rm ../pppm_gpu.h + rm ../pppm_gpu_single.cpp + rm ../pppm_gpu_double.h rm ../pair_gayberne_gpu.h rm ../pair_lj_cut_gpu.h + rm ../pair_morse_gpu.h rm ../pair_lj96_cut_gpu.h + rm ../pair_lj_expand_gpu.h rm ../pair_lj_cut_coul_cut_gpu.h rm ../pair_lj_cut_coul_long_gpu.h rm ../pair_lj_charmm_coul_long_gpu.h rm ../pair_cg_cmm_gpu.h rm ../pair_cg_cmm_coul_long_gpu.h rm ../fix_gpu.h + rm ../gpu_extra.h fi diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 75ce1e83f3..54721900e6 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -24,15 +24,16 @@ #include "modify.h" #include "domain.h" #include "universe.h" +#include "gpu_extra.h" using namespace LAMMPS_NS; enum{GPU_FORCE, GPU_NEIGH}; -extern bool lmp_init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split, - const int nthreads); +extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double particle_split, + const int nthreads, const int t_per_atom); extern void lmp_clear_device(); extern double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul); @@ -42,18 +43,17 @@ extern double lmp_gpu_forces(double **f, double **tor, double *eatom, FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) { - if (narg != 7) error->all("Illegal fix gpu 
command"); + if (narg < 7) error->all("Illegal fix gpu command"); if (strcmp(arg[1],"all") != 0) error->all("Illegal fix gpu command"); - int gpu_mode, first_gpu, last_gpu; - double particle_split; + int first_gpu, last_gpu; if (strcmp(arg[3],"force") == 0) - gpu_mode = GPU_FORCE; + _gpu_mode = GPU_FORCE; else if (strcmp(arg[3],"force/neigh") == 0) { - gpu_mode = GPU_NEIGH; + _gpu_mode = GPU_NEIGH; if (domain->triclinic) error->all("Cannot use force/neigh with triclinic box."); } else @@ -62,13 +62,24 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : first_gpu = atoi(arg[4]); last_gpu = atoi(arg[5]); - particle_split = force->numeric(arg[6]); - if (particle_split==0 || particle_split>1) + _particle_split = force->numeric(arg[6]); + if (_particle_split==0 || _particle_split>1) error->all("Illegal fix gpu command."); - if (!lmp_init_device(universe->uworld,world,first_gpu,last_gpu,gpu_mode, - particle_split,1)) - error->one("Could not find or initialize a specified accelerator device."); + int nthreads = 1; + int threads_per_atom = -1; + if (narg == 9) { + if (strcmp(arg[7],"threads_per_atom") == 0) + threads_per_atom = atoi(arg[8]); + else + error->all("Illegal fix gpu command."); + } else if (narg != 7) + error->all("Illegal fix gpu command."); + + int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu, + _gpu_mode, _particle_split, nthreads, + threads_per_atom); + GPU_EXTRA::check_flag(gpu_flag,error,world); } /* ---------------------------------------------------------------------- */ @@ -95,6 +106,15 @@ void FixGPU::init() // Can only have 1 gpu fix that must be the first fix for a run if ((void*)modify->fix[0] != (void*)this) error->all("GPU is not the first fix for this run."); + // Hybrid cannot be used with force/neigh option + if (_gpu_mode == GPU_NEIGH) + if (force->pair_match("hybrid",1) != NULL || + force->pair_match("hybrid/overlay",1) != NULL) + error->all("Cannot use pair hybrid with GPU neighbor builds."); + if 
(_particle_split < 0) + if (force->pair_match("hybrid",1) != NULL || + force->pair_match("hybrid/overlay",1) != NULL) + error->all("Fix gpu split must be positive for hybrid pair styles."); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/fix_gpu.h b/src/GPU/fix_gpu.h index 35c8dea324..30b53ac879 100644 --- a/src/GPU/fix_gpu.h +++ b/src/GPU/fix_gpu.h @@ -37,6 +37,8 @@ class FixGPU : public Fix { double memory_usage(); private: + int _gpu_mode; + double _particle_split; }; } diff --git a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp index 6d11692d5c..153cb98a9e 100644 --- a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp +++ b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp @@ -35,6 +35,7 @@ #include "domain.h" #include "string.h" #include "kspace.h" +#include "gpu_extra.h" #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) @@ -49,27 +50,29 @@ // External functions from cuda library for atom decomposition -bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); +int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void cmml_gpu_clear(); -int * cmml_gpu_compute_n(const int timestep, const int ago, const int 
inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void cmml_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** cmml_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); +void cmml_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double cmml_gpu_bytes(); using namespace LAMMPS_NS; @@ -95,8 +98,6 @@ PairCGCMMCoulLongGPU::~PairCGCMMCoulLongGPU() void PairCGCMMCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -104,31 +105,32 @@ void PairCGCMMCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = cmml_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, 
atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = cmml_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - cmml_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + cmml_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style cg/cmm/coul/long requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU cg/cmm pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -176,17 +178,13 @@ void PairCGCMMCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3, - lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, - cut_coulsq_global, force->special_coul, - force->qqrd2e, g_ewald); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU cg/cmm pair style"); + int success 
= cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3, + lj4, offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq_global, force->special_coul, + force->qqrd2e, g_ewald); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -205,14 +203,16 @@ double PairCGCMMCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairCGCMMCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; double **x = atom->x; @@ -225,11 +225,6 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -244,13 +239,9 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; - - if (j < nall) factor_coul = factor_lj = 1.0; - else { - factor_coul = special_coul[j/nall]; - factor_lj = special_lj[j/nall]; - j %= nall; - } + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; const double delx = xtmp - x[j][0]; const double dely = ytmp - x[j][1]; @@ -347,156 +338,3 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* 
---------------------------------------------------------------------- */ - -void PairCGCMMCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double rsq; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - const double fraction = (rsq_lookup.f - rtable[itable]) * - drtable[itable]; - const double table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (eflag) { - const double table2 = etable[itable] + fraction*detable[itable]; - ecoul = qtmp*q[j] * table2; - } - if (factor_coul < 1.0) { - const double table2 = ctable[itable] + fraction*dctable[itable]; - const double prefactor = qtmp*q[j] * table2; - forcecoul -= (1.0-factor_coul)*prefactor; - if (eflag) ecoul -= (1.0-factor_coul)*prefactor; - } - } - } - fpair = (forcecoul + forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (j (b) ? 
(a) : (b)) // External functions from cuda library for atom decomposition -bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen); +int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen); void cmm_gpu_clear(); -int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void cmm_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** cmm_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void cmm_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); 
double cmm_gpu_bytes(); using namespace LAMMPS_NS; @@ -84,8 +85,6 @@ PairCGCMMGPU::~PairCGCMMGPU() void PairCGCMMGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -93,30 +92,30 @@ void PairCGCMMGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = cmm_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = cmm_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success); } else { inum = list->inum; - cmm_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + cmm_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if (host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU CGCMM pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -152,15 +151,11 @@ void PairCGCMMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = 
cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU CGCMM pair style"); + int success = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -179,11 +174,13 @@ double PairCGCMMGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairCGCMMGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -192,11 +189,6 @@ void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { int nall = nlocal + atom->nghost; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -210,12 +202,8 @@ void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; - - if (j < nall) factor_lj = 1.0; - else { - factor_lj = special_lj[j/nall]; - j %= nall; - } + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; @@ -266,100 +254,3 @@ void PairCGCMMGPU::cpu_compute(int start, 
int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairCGCMMGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r6inv,forcelj,factor_lj; - double *special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon, - const double mu, double **shape, double **well, double **cutsq, - double **sigma, double **epsilon, double *host_lshape, - int **form, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double *special_lj, const int nlocal, const int nall, - const int max_nbors, const double cell_size, - int &gpu_mode, FILE *screen); +int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, + const double mu, double **shape, double **well, double **cutsq, + double **sigma, double **epsilon, double *host_lshape, + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + int &gpu_mode, FILE *screen); void gb_gpu_clear(); -int * gb_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool 
eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success, - double **host_quat); -int * gb_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat); +int ** gb_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_quat); +int * gb_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat); double gb_gpu_bytes(); using namespace LAMMPS_NS; @@ -77,6 +77,8 @@ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid"); if (!avec) error->all("Pair gayberne requires atom style ellipsoid"); + quat_nmax = 0; + quat = NULL; } /* ---------------------------------------------------------------------- @@ -87,14 +89,13 @@ PairGayBerneGPU::~PairGayBerneGPU() { gb_gpu_clear(); cpu_time = 0.0; + memory->destroy(quat); } /* ---------------------------------------------------------------------- */ void PairGayBerneGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -102,34 +103,47 @@ void PairGayBerneGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; + int *ilist, *numneigh, **firstneigh; + + if (nall > quat_nmax) { + quat_nmax 
= static_cast(1.1 * nall); + memory->grow(quat, quat_nmax, 4, "pair:quat"); + } + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + for (int i=0; i -1) { + quat[i][0] = bonus[qi].quat[0]; + quat[i][1] = bonus[qi].quat[1]; + quat[i][2] = bonus[qi].quat[2]; + quat[i][3] = bonus[qi].quat[3]; + } + } if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - /* MIKE: this arg of atom->quat needs to be modified - gpulist = gb_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, domain->subhi, - eflag, vflag, eflag_atom, vflag_atom, host_start, - cpu_time, success, atom->quat); - */ + firstneigh = gb_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, quat); } else { inum = list->inum; - /* MIKE: this arg of atom->quat needs to be modified - olist = gb_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, - list->firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->quat); - */ + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + olist = gb_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, + cpu_time, success, quat); } if (!success) error->one("Out of memory on GPGPU"); if (host_start < inum) { cpu_time = MPI_Wtime(); - if (gpu_mode == GPU_NEIGH) - cpu_compute(gpulist,host_start,eflag,vflag); - else - cpu_compute(host_start,eflag,vflag); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } } @@ -140,8 +154,8 @@ void PairGayBerneGPU::compute(int eflag, int vflag) void PairGayBerneGPU::init_style() { - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); 
+ if (force->newton_pair) + error->all("Cannot use newton pair with GPU Gay-Berne pair style"); if (!atom->ellipsoid_flag) error->all("Pair gayberne requires atom style ellipsoid"); @@ -179,22 +193,20 @@ void PairGayBerneGPU::init_style() double cell_size = sqrt(maxcut) + neighbor->skin; - bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, - shape1, well, cutsq, sigma, epsilon, lshape, form, - lj1, lj2, lj3, lj4, offset, force->special_lj, - atom->nlocal, atom->nlocal+atom->nghost, 300, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu)."); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU Gay-Berne pair style"); + int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, + shape2, well, cutsq, sigma, epsilon, lshape, form, + lj1, lj2, lj3, lj4, offset, force->special_lj, + atom->nlocal, atom->nlocal+atom->nghost, 300, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } + quat_nmax = static_cast(1.1 * (atom->nlocal + atom->nghost)); + memory->grow(quat, quat_nmax, 4, "pair:quat"); } /* ---------------------------------------------------------------------- */ @@ -202,18 +214,19 @@ void PairGayBerneGPU::init_style() double PairGayBerneGPU::memory_usage() { double bytes = Pair::memory_usage(); - return bytes + gb_gpu_bytes(); + return bytes + memory->usage(quat,quat_nmax)+gb_gpu_bytes(); } /* ---------------------------------------------------------------------- */ -void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) +void PairGayBerneGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype; + int i,j,ii,jj,jnum,itype,jtype; double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj; double 
fforce[3],ttor[3],rtor[3],r12[3]; double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3]; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double *iquat,*jquat; AtomVecEllipsoid::Bonus *bonus = avec->bonus; @@ -225,11 +238,6 @@ void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = olist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -331,143 +339,3 @@ void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairGayBerneGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) -{ - int i,j,itype,jtype; - double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj; - double fforce[3],ttor[3],rtor[3],r12[3]; - double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3]; - double *iquat,*jquat; - - AtomVecEllipsoid::Bonus *bonus = avec->bonus; - int *ellipsoid = atom->ellipsoid; - double **x = atom->x; - double **f = atom->f; - double **tor = atom->torque; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_lj = force->special_lj; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - itype = type[i]; - - if (form[itype][itype] == ELLIPSE_ELLIPSE) { - iquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(iquat,a1); - MathExtra::diag_times3(well[itype],a1,temp); - MathExtra::transpose_times3(a1,temp,b1); - MathExtra::diag_times3(shape2[itype],a1,temp); - MathExtra::transpose_times3(a1,temp,g1); - } - - int *nbor = nbors+i-start; - int jnum =* nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for ( ; nbor < nbor_end; nbor += stride) { - j = *nbor; - factor_lj = special_lj[sbmask(j)]; - j &= NEIGHMASK; - - // r12 = center to center 
vector - - r12[0] = x[j][0]-x[i][0]; - r12[1] = x[j][1]-x[i][1]; - r12[2] = x[j][2]-x[i][2]; - rsq = MathExtra::dot3(r12,r12); - jtype = type[j]; - - // compute if less than cutoff - - if (rsq < cutsq[itype][jtype]) { - - switch (form[itype][jtype]) { - case SPHERE_SPHERE: - r2inv = 1.0/rsq; - r6inv = r2inv*r2inv*r2inv; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - forcelj *= -r2inv; - if (eflag) one_eng = - r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) - - offset[itype][jtype]; - fforce[0] = r12[0]*forcelj; - fforce[1] = r12[1]*forcelj; - fforce[2] = r12[2]*forcelj; - ttor[0] = ttor[1] = ttor[2] = 0.0; - rtor[0] = rtor[1] = rtor[2] = 0.0; - break; - - case SPHERE_ELLIPSE: - jquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(jquat,a2); - MathExtra::diag_times3(well[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,b2); - MathExtra::diag_times3(shape2[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,g2); - one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor); - ttor[0] = ttor[1] = ttor[2] = 0.0; - break; - - case ELLIPSE_SPHERE: - one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor); - rtor[0] = rtor[1] = rtor[2] = 0.0; - break; - - default: - jquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(jquat,a2); - MathExtra::diag_times3(well[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,b2); - MathExtra::diag_times3(shape2[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,g2); - one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq, - fforce,ttor,rtor); - break; - } - - fforce[0] *= factor_lj; - fforce[1] *= factor_lj; - fforce[2] *= factor_lj; - ttor[0] *= factor_lj; - ttor[1] *= factor_lj; - ttor[2] *= factor_lj; - - f[i][0] += fforce[0]; - f[i][1] += fforce[1]; - f[i][2] += fforce[2]; - tor[i][0] += ttor[0]; - tor[i][1] += ttor[1]; - tor[i][2] += ttor[2]; - - if (eflag) evdwl = factor_lj*one_eng; - - if (j (b) ? 
(a) : (b)) // External functions from cuda library for atom decomposition -bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); +int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void lj96_gpu_clear(); -int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void lj96_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** lj96_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void lj96_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double lj96_gpu_bytes(); using 
namespace LAMMPS_NS; @@ -83,8 +84,6 @@ PairLJ96CutGPU::~PairLJ96CutGPU() void PairLJ96CutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -92,30 +91,30 @@ void PairLJ96CutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = lj96_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = lj96_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success); } else { inum = list->inum; - lj96_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + lj96_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if (host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ96 pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -151,15 +150,11 @@ void PairLJ96CutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - 
offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ96 pair style"); + int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -178,11 +173,13 @@ double PairLJ96CutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairLJ96CutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -190,11 +187,6 @@ void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -239,73 +231,3 @@ void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJ96CutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj; - double 
*special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? (a) : (b)) @@ -49,35 +50,35 @@ // External functions from cuda library for atom decomposition -bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, double **sigma, - const bool mix_arithmetic); +int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald, const double cut_lj_innersq, + const double denom_lj, double **epsilon, double **sigma, + const bool mix_arithmetic); void crml_gpu_clear(); -int * crml_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void crml_gpu_compute(const int timestep, 
const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** crml_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); +void crml_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double crml_gpu_bytes(); using namespace LAMMPS_NS; -enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER}; - /* ---------------------------------------------------------------------- */ PairLJCharmmCoulLongGPU::PairLJCharmmCoulLongGPU(LAMMPS *lmp) : @@ -100,8 +101,6 @@ PairLJCharmmCoulLongGPU::~PairLJCharmmCoulLongGPU() void PairLJCharmmCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -109,31 +108,32 @@ void PairLJCharmmCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = crml_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = 
crml_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - crml_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + crml_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/charmm/coul/long requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU CHARMM pair style"); // Repeat cutsq calculation because done after call to init_style double cut; @@ -183,18 +183,24 @@ void PairLJCharmmCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, - cut_coulsq, force->special_coul, force->qqrd2e, - g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, - mix_flag == ARITHMETIC); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - if (force->newton_pair) - error->all("Cannot use newton pair with GPU CHARMM pair style"); + bool arithmetic = true; + for (int i = 1; i < atom->ntypes + 1; i++) + for (int j = i + 1; j < 
atom->ntypes + 1; j++) { + if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j])) + arithmetic = false; + if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j])) + arithmetic = false; + } + + int success = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq, force->special_coul, force->qqrd2e, + g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, + arithmetic); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -213,15 +219,17 @@ double PairLJCharmmCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCharmmCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, + int *numneigh, int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; double philj,switch1,switch2; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; evdwl = ecoul = 0.0; @@ -235,11 +243,6 @@ void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -339,140 +342,3 @@ void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCharmmCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) 
-{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double philj,switch1,switch2; - double rsq; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal - start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; - table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (factor_coul < 1.0) { - table = ctable[itable] + fraction*dctable[itable]; - prefactor = qtmp*q[j] * table; - forcecoul -= (1.0-factor_coul)*prefactor; - } - } - } else forcecoul = 0.0; - - if (rsq < cut_ljsq) { - r6inv = r2inv*r2inv*r2inv; - jtype = type[j]; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - if (rsq > cut_lj_innersq) { - switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * - (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; - switch2 = 12.0*rsq * (cut_ljsq-rsq) * - (rsq-cut_lj_innersq) / denom_lj; - philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]); - forcelj = forcelj*switch1 + philj*switch2; - } - } else forcelj = 0.0; - - fpair = (forcecoul + factor_lj*forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (eflag) { - if (rsq < cut_coulsq) { - if (!ncoultablebits || rsq <= tabinnersq) - ecoul = prefactor*erfc; - else { - table = etable[itable] + 
fraction*detable[itable]; - ecoul = qtmp*q[j] * table; - } - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } else ecoul = 0.0; - - if (rsq < cut_ljsq) { - evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]); - if (rsq > cut_lj_innersq) { - switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * - (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; - evdwl *= switch1; - } - evdwl *= factor_lj; - } else evdwl = 0.0; - } - - if (j (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e); +int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e); void ljc_gpu_clear(); -int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void ljc_gpu_compute(const int timestep, const int ago, const int inum, +int ** ljc_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool 
vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); +void ljc_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd); double ljc_gpu_bytes(); using namespace LAMMPS_NS; @@ -85,8 +89,6 @@ PairLJCutCoulCutGPU::~PairLJCutCoulCutGPU() void PairLJCutCoulCutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -94,31 +96,32 @@ void PairLJCutCoulCutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljc_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = ljc_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - ljc_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljc_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + 
ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/cut/coul/cut requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -154,16 +158,12 @@ void PairLJCutCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, - force->special_coul, force->qqrd2e); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + int success = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, + force->special_coul, force->qqrd2e); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -182,12 +182,14 @@ double PairLJCutCoulCutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCutCoulCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype; + int i,j,ii,jj,jnum,itype,jtype; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; 
double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; evdwl = ecoul = 0.0; @@ -201,11 +203,6 @@ void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) int newton_pair = force->newton_pair; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -264,94 +261,3 @@ void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutCoulCutGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? 
(a) : (b)) @@ -49,27 +50,29 @@ // External functions from cuda library for atom decomposition -bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); +int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void ljcl_gpu_clear(); -int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void ljcl_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** ljcl_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double 
*host_q, + double *boxlo, double *prd); +void ljcl_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double ljcl_gpu_bytes(); using namespace LAMMPS_NS; @@ -96,8 +99,6 @@ PairLJCutCoulLongGPU::~PairLJCutCoulLongGPU() void PairLJCutCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -105,31 +106,32 @@ void PairLJCutCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljcl_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = ljcl_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - ljcl_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljcl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, 
domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/cut/coul/cut requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -179,16 +181,12 @@ void PairLJCutCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + int success = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -207,14 +205,16 @@ double PairLJCutCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCutCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; evdwl = ecoul = 0.0; @@ -228,11 +228,6 @@ void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int 
vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -320,127 +315,3 @@ void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double rsq; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; - table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (factor_coul < 1.0) { - table = ctable[itable] + fraction*dctable[itable]; - prefactor = qtmp*q[j] * table; - forcecoul -= (1.0-factor_coul)*prefactor; - } - } - } else forcecoul = 0.0; - - if (rsq < cut_ljsq[itype][jtype]) { - r6inv = r2inv*r2inv*r2inv; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - } else forcelj = 0.0; - - fpair = (forcecoul + factor_lj*forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += 
delz*fpair; - - if (eflag) { - if (rsq < cut_coulsq) { - if (!ncoultablebits || rsq <= tabinnersq) - ecoul = prefactor*erfc; - else { - table = etable[itable] + fraction*detable[itable]; - ecoul = qtmp*q[j] * table; - } - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } else ecoul = 0.0; - - if (rsq < cut_ljsq[itype][jtype]) { - evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - - offset[itype][jtype]; - evdwl *= factor_lj; - } else evdwl = 0.0; - } - - if (j (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); +int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void ljl_gpu_clear(); -int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void ljl_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** ljl_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int 
**nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void ljl_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double ljl_gpu_bytes(); using namespace LAMMPS_NS; @@ -83,8 +84,6 @@ PairLJCutGPU::~PairLJCutGPU() void PairLJCutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -92,30 +91,30 @@ void PairLJCutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljl_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = ljl_gpu_compute_n(neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, + &ilist, &numneigh, cpu_time, success); } else { inum = list->inum; - ljl_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if 
(host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -151,15 +150,11 @@ void PairLJCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -178,11 +173,12 @@ double PairLJCutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairLJCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) { + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -190,11 +186,6 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii 
< inum; ii++) { @@ -238,73 +229,3 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r6inv,forcelj,factor_lj; - double *special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor Date: Mon, 2 May 2011 15:05:13 +0000 Subject: [PATCH 20/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6056 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/pair_hybrid.cpp | 4 ---- src/pair_lj_expand.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp index 786eb674fd..a3d0dafd32 100644 --- a/src/pair_hybrid.cpp +++ b/src/pair_hybrid.cpp @@ -199,7 +199,6 @@ void PairHybrid::settings(int narg, char **arg) // exception is 1st arg of style "table", which is non-numeric word // exception is 1st two args of style "lj/coul", which are non-numeric // exception is 1st two args of style "buck/coul", which are non-numeric - // exception is 1st arg of any "gpu" style, which is non-numeric // exception is 1st arg of reax/c style, which is non-numeric // need a better way to skip these exceptions @@ -209,7 +208,6 @@ void PairHybrid::settings(int narg, char **arg) if (strcmp(arg[i],"table") == 0) i++; if (strcmp(arg[i],"lj/coul") == 0) i += 2; if (strcmp(arg[i],"buck/coul") == 0) i += 2; - if (strstr(arg[i],"gpu")) i++; if (strcmp(arg[i],"reax/c") == 0) i++; i++; while (i < narg && 
!isalpha(arg[i][0])) i++; @@ -226,7 +224,6 @@ void PairHybrid::settings(int narg, char **arg) // exception is 1st arg of style "table", which is non-numeric // exception is 1st two args of style "lj/coul", which are non-numeric // exception is 1st two args of style "buck/coul", which are non-numeric - // exception is 1st arg of any "gpu" style, which is non-numeric // exception is 1st arg of reax/c style, which is non-numeric // need a better way to skip these exceptions @@ -247,7 +244,6 @@ void PairHybrid::settings(int narg, char **arg) if (strcmp(arg[i],"table") == 0) i++; if (strcmp(arg[i],"lj/coul") == 0) i += 2; if (strcmp(arg[i],"buck/coul") == 0) i += 2; - if (strstr(arg[i],"gpu")) i++; if (strcmp(arg[i],"reax/c") == 0) i++; i++; while (i < narg && !isalpha(arg[i][0])) i++; diff --git a/src/pair_lj_expand.h b/src/pair_lj_expand.h index fa6b136c67..1d1b10c315 100644 --- a/src/pair_lj_expand.h +++ b/src/pair_lj_expand.h @@ -38,7 +38,7 @@ class PairLJExpand : public Pair { void read_restart_settings(FILE *); double single(int, int, int, int, double, double, double, double &); - private: + protected: double cut_global; double **cut; double **epsilon,**sigma,**shift; From 0add57d01eef748c864c0de0394cf817f4f847ff Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:10:37 +0000 Subject: [PATCH 21/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6058 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index b7cd4f016f..382d330033 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "29 Apr 2011" +#define LAMMPS_VERSION "2 May 2011"
      pppm GPU single and double Mike Brown (ORNL)
      pair_style lj/cut/expand Inderaj Bains (NVIDIA)
      temperature accelerated dynamics (TAD) Aidan Thompson (Sandia)
      pair reax/c and fix qeq/reax Metin Aktulga (Purdue, now LBNL)
      DREIDING force field, pair_style hbond/dreiding, etc Tod Pascal (CalTech)
      fix adapt and compute ti for thermodynamic integration for free energies Sai Jayaraman (Sandia)
      pair born and pair gauss Sai Jayaraman (Sandia)
      stochastic rotation dynamics (SRD) via fix srd Jeremy Lechman (Sandia) and Pieter in 't Veld (BASF)
      ipp Perl script tool Reese Jones (Sandia)
      eam_database and createatoms tools Xiaowang Zhou (Sandia)
      electron force field (eFF) Andres Jaramillo-Botero and Julius Su (Caltech)