Cleaned up debugging stuffs and unused variables

This commit is contained in:
Trung Nguyen
2021-09-17 23:13:51 -05:00
parent f5713a52b3
commit 78045d8f76
3 changed files with 89 additions and 101 deletions

View File

@ -62,7 +62,7 @@ _texture( q_tex,int2);
tq.z=red_acc[2][tid]; \
} \
if (offset==0 && ii<inum) { \
tep[i]=tq; \
tep[i]=tq; \
}
#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset, \
@ -147,14 +147,14 @@ _texture( q_tex,int2);
fieldp[ii+inum] = fp; \
}
#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom, \
#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \
offset, eflag, vflag, ans, engv, ev_stride) \
if (t_per_atom>1) { \
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
if (EVFLAG && (vflag==2 || eflag==2)) { \
if (eflag) { \
simdsync(); \
simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
} \
if (vflag) { \
simdsync(); \
@ -174,7 +174,7 @@ _texture( q_tex,int2);
if (eflag!=2 && vflag!=2) { \
if (eflag) { \
simdsync(); \
block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \
block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \
if (vflag) __syncthreads(); \
if (tid==0) { \
engv[ei]+=energy*(acctyp)0.5; \
@ -225,7 +225,7 @@ _texture( q_tex,int2);
} \
} \
if (offset==0 && ii<inum) { \
tep[i]=tq; \
tep[i]=tq; \
}
#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset, \
@ -280,25 +280,23 @@ _texture( q_tex,int2);
#if (EVFLAG == 1)
#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom, \
#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
offset, eflag, vflag, ans, engv, ev_stride) \
if (t_per_atom>1) { \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (vflag==2 || eflag==2) { \
if (eflag) \
simd_reduce_add2(t_per_atom,energy,e_coul); \
simd_reduce_add2(t_per_atom,energy,e_coul); \
if (vflag) \
simd_reduce_arr(6, t_per_atom,virial); \
} \
} \
if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \
if (ii == 0) printf("old = %f %f %f\n", old.x, old.y, old.z); \
old.x+=f.x; \
old.y+=f.y; \
old.z+=f.z; \
ans[ii]=old; \
if (ii == 0) printf("new = %f %f %f\n", old.x, old.y, old.z); \
} \
if (eflag || vflag) { \
if (eflag!=2 && vflag!=2) { \
@ -378,7 +376,7 @@ _texture( q_tex,int2);
// EVFLAG == 0
#else
#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom, \
#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
offset, eflag, vflag, ans, engv, ev_stride) \
if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
@ -402,20 +400,20 @@ _texture( q_tex,int2);
------------------------------------------------------------------------- */
__kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
const __global numtyp *restrict extra,
const __global numtyp4 *restrict damping,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
const __global int *dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global numtyp4 *restrict tep,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp felec,
const numtyp off2, const numtyp polar_dscale,
const numtyp polar_uscale)
const __global numtyp *restrict extra,
const __global numtyp4 *restrict damping,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
const __global int *dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global numtyp4 *restrict tep,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
const int t_per_atom, const numtyp aewald,
const numtyp felec, const numtyp off2,
const numtyp polar_dscale, const numtyp polar_uscale)
{
int tid, ii, offset, i;
atom_info(t_per_atom,ii,tid,offset);
@ -439,14 +437,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
//numtyp4 xi__;
if (ii<inum) {
int k,m,itype,igroup;
int m,itype,igroup;
numtyp bfac;
numtyp term1,term2,term3;
numtyp term4,term5;
numtyp term6;
numtyp term4,term5,term6;
numtyp bn[6];
numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
@ -480,9 +475,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
itype = polar3[i].z; // amtype[i];
igroup = polar3[i].w; // amgroup[i];
// debug:
// xi__ = ix; xi__.w = itype;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int jextra=nbor_mem[nbor];
@ -497,7 +489,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
numtyp zr = jx.z - ix.z;
numtyp r2 = xr*xr + yr*yr + zr*zr;
if (r2>off2) continue;
//if (r2>off2) continue;
numtyp r = ucl_sqrt(r2);
numtyp ck = polar1[j].x; // rpole[j][0];
@ -613,14 +605,14 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
for (m = 1; m < 6; m++) {
bfac = (numtyp) (m+m-1);
alsq2n = alsq2 * alsq2n;
bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
}
for (m = 0; m < 6; m++) bn[m] *= felec;
term1 = ci*ck;
term2 = ck*dir - ci*dkr + dik;
term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk);
term4 = dir*qkr - dkr*qir - 4.0*qik;
term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik;
term5 = qir*qkr;
numtyp scalek = (numtyp)1.0 - factor_mpole;
rr1 = bn[0] - scalek*rr1;
@ -730,8 +722,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
//numtyp4 xi__;
if (ii<inum) {
int numj, nbor, nbor_end;
const __global int* nbor_mem=dev_packed;
@ -1337,7 +1327,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
for (m = 1; m <= 4; m++) {
bfac = (numtyp) (m+m-1);
alsq2n = alsq2 * alsq2n;
bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
}
for (m = 0; m < 5; m++) bn[m] *= felec;

View File

@ -60,7 +60,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, int &gpu_mode, FILE *screen,
const double polar_dscale, const double polar_uscale, int& tep_size);
const double polar_dscale, const double polar_uscale, int& tq_size);
void amoeba_gpu_clear();
int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
@ -70,7 +70,7 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum, const double cpu_time,
bool &success, const double aewald, const double felec, const double off2,
double *host_q, double *boxlo, double *prd, void **tep_ptr);
double *host_q, double *boxlo, double *prd, void **tq_ptr);
int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
@ -100,7 +100,7 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum, const double cpu_time,
bool &success, const double aewald, const double felec, const double off2,
double *host_q, double *boxlo, double *prd, void **tep_ptr);
double *host_q, double *boxlo, double *prd, void **tq_ptr);
double amoeba_gpu_bytes();
@ -113,7 +113,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
cpu_time = 0.0;
suffix_flag |= Suffix::GPU;
fieldp_pinned = nullptr;
tep_pinned = nullptr;
tq_pinned = nullptr;
gpu_multipole_real_ready = true;
gpu_udirect2b_ready = true;
@ -166,23 +166,23 @@ void PairAmoebaGPU::init_style()
maxspecial15=atom->maxspecial15;
}
int tep_size;
int tq_size;
int mnf = 5e-2 * neighbor->oneatom;
int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp,
special_mpole, special_polar_wscale, special_polar_piscale,
special_polar_pscale, atom->nlocal,
atom->nlocal+atom->nghost, mnf, maxspecial,
maxspecial15, cell_size, gpu_mode, screen,
polar_dscale, polar_uscale, tep_size);
polar_dscale, polar_uscale, tq_size);
GPU_EXTRA::check_flag(success,error,world);
if (gpu_mode == GPU_FORCE)
error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now");
if (tep_size == sizeof(double))
tep_single = false;
if (tq_size == sizeof(double))
tq_single = false;
else
tep_single = true;
tq_single = true;
}
/* ---------------------------------------------------------------------- */
@ -231,19 +231,19 @@ void PairAmoebaGPU::multipole_real()
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success, aewald, felec, off2, atom->q,
domain->boxlo, domain->prd, &tep_pinned);
domain->boxlo, domain->prd, &tq_pinned);
if (!success)
error->one(FLERR,"Insufficient memory on accelerator");
// reference to the tep array from GPU lib
printf("compute multipole real\n");
if (tep_single) {
float *tep_ptr = (float *)tep_pinned;
compute_force_from_tep<float>(tep_ptr, fmpole, virmpole);
if (tq_single) {
float *tq_ptr = (float *)tq_pinned;
compute_force_from_torque<float>(tq_ptr, fmpole, virmpole);
} else {
double *tep_ptr = (double *)tep_pinned;
compute_force_from_tep<double>(tep_ptr, fmpole, virmpole);
double *tq_ptr = (double *)tq_pinned;
compute_force_from_torque<double>(tq_ptr, fmpole, virmpole);
}
}
@ -681,7 +681,6 @@ void PairAmoebaGPU::induce()
}
}
/* ----------------------------------------------------------------------
udirect2b = Ewald real direct field via list
udirect2b computes the real space contribution of the permanent
@ -721,19 +720,20 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
else choose(POLAR);
firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x,
atom->type, amtype, amgroup, rpole, uind, uinp,
sublo, subhi, atom->tag, atom->nspecial, atom->special,
atom->nspecial15, atom->special15,
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success, aewald, off2, atom->q, domain->boxlo,
domain->prd, &fieldp_pinned);
atom->type, amtype, amgroup, rpole,
uind, uinp, sublo, subhi, atom->tag,
atom->nspecial, atom->special,
atom->nspecial15, atom->special15,
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success, aewald, off2, atom->q,
domain->boxlo, domain->prd, &fieldp_pinned);
if (!success)
error->one(FLERR,"Insufficient memory on accelerator");
// rebuild dipole-dipole pair list and store pairwise dipole matrices
// done one atom at a time in real-space double loop over atoms & neighs
// NOTE: for the moment the tdipdip values are computed just in time in umutual2b()
// udirect2b_cpu();
// accumulate the field and fieldp values from the GPU lib
@ -945,13 +945,14 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
else choose(POLAR);
firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x,
atom->type, amtype, amgroup, rpole, uind, uinp,
sublo, subhi, atom->tag, atom->nspecial, atom->special,
atom->nspecial15, atom->special15,
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success,aewald, off2, atom->q, domain->boxlo,
domain->prd, &fieldp_pinned);
atom->type, amtype, amgroup, rpole,
uind, uinp, sublo, subhi, atom->tag,
atom->nspecial, atom->special,
atom->nspecial15, atom->special15,
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success,aewald, off2, atom->q,
domain->boxlo, domain->prd, &fieldp_pinned);
if (!success)
error->one(FLERR,"Insufficient memory on accelerator");
@ -1017,37 +1018,37 @@ void PairAmoebaGPU::polar_real()
double felec = 0.5 * electric / am_dielectric;
firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
atom->type, amtype, amgroup,
rpole, uind, uinp, sublo, subhi,
atom->tag, atom->nspecial, atom->special,
atom->nspecial15, atom->special15,
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success, aewald, felec, off2, atom->q, domain->boxlo,
domain->prd, &tep_pinned);
atom->type, amtype, amgroup,
rpole, uind, uinp, sublo, subhi,
atom->tag, atom->nspecial, atom->special,
atom->nspecial15, atom->special15,
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success, aewald, felec, off2, atom->q,
domain->boxlo, domain->prd, &tq_pinned);
if (!success)
error->one(FLERR,"Insufficient memory on accelerator");
// reference to the tep array from GPU lib
printf("compute polar real\n");
if (tep_single) {
float *tep_ptr = (float *)tep_pinned;
compute_force_from_tep<float>(tep_ptr, fpolar, virpolar);
if (tq_single) {
float *tep_ptr = (float *)tq_pinned;
compute_force_from_torque<float>(tep_ptr, fpolar, virpolar);
} else {
double *tep_ptr = (double *)tep_pinned;
compute_force_from_tep<double>(tep_ptr, fpolar, virpolar);
double *tep_ptr = (double *)tq_pinned;
compute_force_from_torque<double>(tep_ptr, fpolar, virpolar);
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
compute atom forces from torques
------------------------------------------------------------------------- */
template <class numtyp>
void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
double** force_comp,
double* virial_comp)
void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr,
double** force_comp,
double* virial_comp)
{
int i,ix,iy,iz;
double xix,yix,zix;
@ -1055,19 +1056,16 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
double xiz,yiz,ziz;
double vxx,vyy,vzz;
double vxy,vxz,vyz;
double fix[3],fiy[3],fiz[3],tep[4];
double fix[3],fiy[3],fiz[3],_tq[4];
double** x = atom->x;
int nlocal = atom->nlocal;
for (i = 0; i < nlocal; i++) {
tep[0] = tep_ptr[4*i];
tep[1] = tep_ptr[4*i+1];
tep[2] = tep_ptr[4*i+2];
if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]);
torque2force(i,tep,fix,fiy,fiz,force_comp);
if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]);
_tq[0] = tq_ptr[4*i];
_tq[1] = tq_ptr[4*i+1];
_tq[2] = tq_ptr[4*i+2];
torque2force(i,_tq,fix,fiy,fiz,force_comp);
iz = zaxis2local[i];
ix = xaxis2local[i];
@ -1092,7 +1090,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
//if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
virial_comp[0] += vxx;
virial_comp[1] += vyy;
virial_comp[2] += vzz;

View File

@ -43,9 +43,9 @@ class PairAmoebaGPU : public PairAmoeba {
private:
int gpu_mode;
double cpu_time;
void *tep_pinned;
void *tq_pinned;
void *fieldp_pinned;
bool tep_single;
bool tq_single;
bool gpu_multipole_real_ready;
bool gpu_udirect2b_ready;
@ -55,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba {
void udirect2b_cpu();
template<class numtyp>
void compute_force_from_tep(const numtyp*, double**, double*);
void compute_force_from_torque(const numtyp*, double**, double*);
};
} // namespace LAMMPS_NS