diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 6317ba8d94..f572d3ebd0 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -410,7 +410,7 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -442,10 +442,10 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, acctyp4 tq; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; - - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -583,12 +581,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; // calculate the real space Ewald error function terms - + numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[6]; bn[0] = ucl_erfc(ralpha) * rinv; - + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -691,7 +689,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -707,14 +705,14 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - //local_allocate_store_charge(); + local_allocate_store_ufld(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; @@ -1049,7 +1046,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -1068,7 +1065,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_ufld(); local_allocate_store_charge(); acctyp4 f; @@ -1086,11 +1082,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, for (int l=0; l<6; l++) dufld[l]=(acctyp)0; numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); - numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); + + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; + const __global numtyp4* polar4 = &extra[3*nall]; + const __global numtyp4* polar5 = &extra[4*nall]; if (ii0) - bytes+=_extra_fields*sizeof(numtyp); + bytes+=_extra_fields*sizeof(numtyp4); return bytes; } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 4b29d76cb1..771c2a3571 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -516,7 +516,7 @@ class Atom { /// Velocities UCL_Vector v; /// Extras - UCL_Vector extra; + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e80fa01c2b..09d7386461 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -90,7 +90,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, bool vel = false; _extra_fields = 24; // round up to accomodate quadruples of numtyp values // rpole 13; uind 3; uinp 3; amtype, amgroup; pval - int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields); + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); if (success!=0) return success; @@ -820,35 +820,35 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, atom->extra_data_unavail(); int _nall=atom->nall(); - numtyp *pextra=reinterpret_cast(&(atom->extra[0])); + numtyp4 *pextra=reinterpret_cast(&(atom->extra[0])); int n = 0; - int nstride = 4; + int nstride = 1; //4; if (rpole) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][0]; - pextra[idx+1] = rpole[i][1]; - pextra[idx+2] = rpole[i][2]; - pextra[idx+3] = rpole[i][3]; + pextra[idx].x = rpole[i][0]; + pextra[idx].y = rpole[i][1]; + pextra[idx].z = rpole[i][2]; + pextra[idx].w = rpole[i][3]; } n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][4]; - pextra[idx+1] = rpole[i][5]; - pextra[idx+2] = rpole[i][6]; - pextra[idx+3] = rpole[i][8]; + pextra[idx].x = rpole[i][4]; + pextra[idx].y = rpole[i][5]; + pextra[idx].z = rpole[i][6]; + pextra[idx].w = rpole[i][8]; } n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][9]; - pextra[idx+1] = rpole[i][12]; - pextra[idx+2] = (numtyp)amtype[i]; - pextra[idx+3] = (numtyp)amgroup[i]; + pextra[idx].x = rpole[i][9]; + pextra[idx].y = rpole[i][12]; + pextra[idx].z = (numtyp)amtype[i]; + pextra[idx].w = (numtyp)amgroup[i]; } } else { n += 2*nstride*_nall; @@ -858,9 +858,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (uind) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = uind[i][0]; - pextra[idx+1] = uind[i][1]; - pextra[idx+2] = uind[i][2]; + pextra[idx].x = uind[i][0]; + pextra[idx].y = uind[i][1]; + pextra[idx].z = uind[i][2]; + pextra[idx].w = 0; } } @@ -868,9 +869,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (uinp) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = uinp[i][0]; - pextra[idx+1] = uinp[i][1]; - pextra[idx+2] = uinp[i][2]; + pextra[idx].x = uinp[i][0]; + pextra[idx].y = uinp[i][1]; + pextra[idx].z = uinp[i][2]; + pextra[idx].w = 0; } } @@ -878,7 +880,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (pval) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = pval[i]; + pextra[idx].x = pval[i]; + pextra[idx].y = 0; + pextra[idx].z = 0; + pextra[idx].w = 0; } } } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 1611e8aece..99e20db223 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -410,7 +410,7 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_rep, const __global numtyp4 *restrict sp_nonpolar, const __global int *dev_nbor, @@ -444,9 +444,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, acctyp4 tq; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - const numtyp4 pol1j = polar1[j]; //numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -712,7 +710,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_nonpolar, @@ -741,7 +739,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, for (int l=0; l<6; l++) virial[l]=(acctyp)0; } - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar3 = &extra[2*nall]; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + int m; for (m = 1; m < 6; m++) { numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; @@ -1208,32 +1203,32 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_polar, const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict fieldp, - const int inum, const int nall, - const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); int n_stride; - //local_allocate_store_charge(); + local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; + const __global numtyp4* polar6 = &extra[5*nall]; if (ii