diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 49c0d78d7f..41185f30e3 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -62,7 +62,7 @@ _texture( q_tex,int2); tq.z=red_acc[2][tid]; \ } \ if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ if (EVFLAG && (vflag==2 || eflag==2)) { \ if (eflag) { \ simdsync(); \ - simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ if (vflag) { \ simdsync(); \ @@ -174,7 +174,7 @@ _texture( q_tex,int2); if (eflag!=2 && vflag!=2) { \ if (eflag) { \ simdsync(); \ - block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \ + block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \ if (vflag) __syncthreads(); \ if (tid==0) { \ engv[ei]+=energy*(acctyp)0.5; \ @@ -225,7 +225,7 @@ _texture( q_tex,int2); } \ } \ if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (vflag==2 || eflag==2) { \ if (eflag) \ - simd_reduce_add2(t_per_atom,energy,e_coul); \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ if (vflag) \ simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -402,20 +400,20 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - __global numtyp4 *restrict tep, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp felec, - const numtyp off2, const numtyp polar_dscale, - const numtyp polar_uscale) + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global numtyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp felec, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -439,14 +437,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - //numtyp4 xi__; - if (iioff2) continue; + //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); numtyp ck = polar1[j].x; // rpole[j][0]; @@ -613,14 +605,14 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, for (m = 1; m < 6; m++) { bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; - bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } for (m = 0; m < 6; m++) bn[m] *= felec; term1 = ci*ck; term2 = ck*dir - ci*dkr + dik; - term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); - term4 = dir*qkr - dkr*qir - 4.0*qik; + term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; term5 = qir*qkr; numtyp scalek = (numtyp)1.0 - factor_mpole; rr1 = bn[0] - scalek*rr1; @@ -730,8 +722,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - //numtyp4 xi__; - if (iimaxspecial15; } - int tep_size; + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, - polar_dscale, polar_uscale, tep_size); + polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - if (tep_size == sizeof(double)) - tep_single = false; + if (tq_size == sizeof(double)) + tq_single = false; else - tep_single = true; + tq_single = true; } /* ---------------------------------------------------------------------- */ @@ -231,19 +231,19 @@ void PairAmoebaGPU::multipole_real() eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tep_pinned); + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // reference to the tep array from GPU lib - printf("compute multipole real\n"); - if (tep_single) { - float *tep_ptr = (float *)tep_pinned; - compute_force_from_tep(tep_ptr, fmpole, virmpole); + + if (tq_single) { + float *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); } else { - double *tep_ptr = (double *)tep_pinned; - compute_force_from_tep(tep_ptr, fmpole, virmpole); + double *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); } } @@ -681,7 +681,6 @@ void PairAmoebaGPU::induce() } } - /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list udirect2b computes the real space contribution of the permanent @@ -721,19 +720,20 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) else choose(POLAR); firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, uind, uinp, - sublo, subhi, atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, domain->boxlo, - domain->prd, &fieldp_pinned); + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs - + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() // udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib @@ -945,13 +945,14 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) else choose(POLAR); firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, uind, uinp, - sublo, subhi, atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, domain->boxlo, - domain->prd, &fieldp_pinned); + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success,aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1017,37 +1018,37 @@ void PairAmoebaGPU::polar_real() double felec = 0.5 * electric / am_dielectric; firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, domain->boxlo, - domain->prd, &tep_pinned); + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // reference to the tep array from GPU lib - printf("compute polar real\n"); - if (tep_single) { - float *tep_ptr = (float *)tep_pinned; - compute_force_from_tep(tep_ptr, fpolar, virpolar); + + if (tq_single) { + float *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); } else { - double *tep_ptr = (double *)tep_pinned; - compute_force_from_tep(tep_ptr, fpolar, virpolar); + double *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); } } /* ---------------------------------------------------------------------- - init specific to this pair style + compute atom forces from torques ------------------------------------------------------------------------- */ template -void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, - double** force_comp, - double* virial_comp) +void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) { int i,ix,iy,iz; double xix,yix,zix; @@ -1055,19 +1056,16 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, double xiz,yiz,ziz; double vxx,vyy,vzz; double vxy,vxz,vyz; - double fix[3],fiy[3],fiz[3],tep[4]; + double fix[3],fiy[3],fiz[3],_tq[4]; double** x = atom->x; int nlocal = atom->nlocal; for (i = 0; i < nlocal; i++) { - tep[0] = tep_ptr[4*i]; - tep[1] = tep_ptr[4*i+1]; - tep[2] = tep_ptr[4*i+2]; - - if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); - torque2force(i,tep,fix,fiy,fiz,force_comp); - if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); iz = zaxis2local[i]; ix = xaxis2local[i]; @@ -1092,7 +1090,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); + virial_comp[0] += vxx; virial_comp[1] += vyy; virial_comp[2] += vzz; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index a913449a62..d9a3fc5904 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -43,9 +43,9 @@ class PairAmoebaGPU : public PairAmoeba { private: int gpu_mode; double cpu_time; - void *tep_pinned; + void *tq_pinned; void *fieldp_pinned; - bool tep_single; + bool tq_single; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; @@ -55,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba { void udirect2b_cpu(); template - void compute_force_from_tep(const numtyp*, double**, double*); + void compute_force_from_torque(const numtyp*, double**, double*); }; } // namespace LAMMPS_NS