diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu index 4d21c093a8..b965df25a2 100644 --- a/lib/gpu/lal_sph_lj.cu +++ b/lib/gpu/lal_sph_lj.cu @@ -29,41 +29,86 @@ _texture_2d( vel_tex,int4); #if (SHUFFLE_AVAIL == 0) -#define store_drhoE(drhoI, deltaE, ii, inum, tid, t_per_atom, offset, \ - drhoE) \ +#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE) \ if (t_per_atom>1) { \ simdsync(); \ - simd_reduce_add2(t_per_atom, red_acc, offset, tid, rhoEi, deltaE); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, \ + drhoEacc.x, drhoEacc.y); \ } \ if (offset==0 && ii1) { \ - simd_reduce_add2(t_per_atom,drhoI,deltaE); \ - } \ - if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + drhoEacc.x += shfl_down(drhoEacc.x, s, t_per_atom); \ + drhoEacc.y += shfl_down(drhoEacc.y, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii (numtyp)0.0) { + pc[1] = ucl_sqrt(csq); // soundspeed + } else { + pc[1] = (numtyp)0.0; + } +} + + __kernel void k_sph_lj(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict extra, - const __global numtyp4 *restrict coeff, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp3 *restrict ans, - __global acctyp *restrict engv, - __global acctyp *restrict drhoE, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp4 *restrict v_, - const int t_per_atom) { + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -87,7 +132,6 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; - numtyp mass_itype = mass[itype]; numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i]; const numtyp4 extrai = extra[i]; @@ -97,11 +141,12 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, numtyp massi= extrai.w; // compute pressure of particle i with LJ EOS - numtyp fi, ci; - //LJEOS2(rho[i], esph[i], cv[i], &fi, &ci); - //fi /= (rho[i] * rho[i]); + numtyp fci[2]; + LJEOS2(rhoi, esphi, cvi, fci); + numtyp fi = fci[0]; + numtyp ci = fci[1]; + fi /= (rhoi * rhoi); - numtyp factor_lj; for ( ; nbordimension == 3 Lucy Kernel, 3d - wfd = -25.066903536973515383e0 * wfd * wfd * ihsq * ihsq * ihsq * ih; + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; // Lucy Kernel, 2d //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; // function call to LJ EOS - numtyp fj, cj; - //LJEOS2(rho[j], esph[j], cv[j], &fj, &cj); - //fj /= (rho[j] * rho[j]); + numtyp fcj[2]; + LJEOS2(rhoj, esphj, cvj, fcj); + numtyp fj = fcj[0]; + numtyp cj = fcj[1]; + fj /= (rhoj * rhoj); // apply long-range correction to model a LJ fluid with cutoff // this implies that the modelled LJ fluid has cutoff == SPH cutoff @@ -163,7 +210,7 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, // artificial viscosity (Monaghan 1992) numtyp fvisc = (numtyp)0; if (delVdotDelR < (numyp)0) { - mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); + numtyp mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] } @@ -202,18 +249,18 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, } __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict extra, - const __global numtyp4 *restrict coeff_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp3 *restrict ans, - __global acctyp *restrict engv, - __global acctyp *restrict drhoE, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp4 *restrict v_, - const int t_per_atom) { + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -229,7 +276,8 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, __syncthreads(); #else const numtyp coeffx=coeff_in[ONETYPE].x; // viscosity[itype][jtype] - const numtyp coeffy=coeff_in[ONETYPE].y; // cutsq[itype][jtype] + const numtyp coeffy=coeff_in[ONETYPE].y; // cut[itype][jtype] + const numtyp cutsq_p=coeff_in[ONETYPE].z; // cutsq[itype][jtype] #endif int n_stride; @@ -242,7 +290,8 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, energy=(acctyp)0; for (int i=0; i<6; i++) virial[i]=(acctyp)0; } - acctyp Qi = (acctyp)0; + acctyp2 drhoEacc; + drhoEacc.x = drhoEacc.x = (acctyp)0; if (iidimension == 3 Lucy Kernel, 3d + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + + // Lucy Kernel, 2d + //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + + // function call to LJ EOS + numtyp fcj[2]; + LJEOS2(rhoj, esphj, cvj, fcj); + numtyp fj = fcj[0]; + numtyp cj = fcj[1]; + fj /= (rhoj * rhoj); + + // apply long-range correction to model a LJ fluid with cutoff + // this implies that the modelled LJ fluid has cutoff == SPH cutoff + numtyp lrc = (numtyp)-11.1701 * (ihcub * ihcub * ihcub - (numtyp)1.5 * ihcub); + fi += lrc; + fj += lrc; + + // dot product of velocity delta and distance vector numtyp delvx = iv.x - jv.x; numtyp delvy = iv.y - jv.y; numtyp delvz = iv.z - jv.z; - numtyp dot = delx*delvx + dely*delvy + delz*delvz; - numtyp vijeij = dot*rinv; + numtyp delVdotDelR = delx*delvx + dely*delvy + delz*delvz; - #ifndef ONETYPE - const numtyp coeffx=coeff[mtype].x; // a0[itype][jtype] - const numtyp coeffy=coeff[mtype].y; // gamma[itype][jtype] - #endif - - const numtyp4 Tcvj = extra[j]; - numtyp Tj = Tcvj.x; - numtyp cvj = Tcvj.y; - - unsigned int tag1=itag, tag2=jtag; - if (tag1 > tag2) { - tag1 = jtag; tag2 = itag; - } - numtyp randnum = (numtyp)0.0; - saru(tag1, tag2, seed, timestep, randnum); - - numtyp T_ij=(numtyp)0.5*(Ti+Tj); - numtyp4 T_pow; - T_pow.x = T_ij - (numtyp)1.0; - T_pow.y = T_pow.x*T_pow.x; - T_pow.z = T_pow.x*T_pow.y; - T_pow.w = T_pow.x*T_pow.z; - - numtyp power_d = coeff2x; // power[itype][jtype] - if (power_flag) { - numtyp factor = (numtyp)1.0; - factor += scx*T_pow.x + scy*T_pow.y + scz*T_pow.z + scw*T_pow.w; - power_d *= factor; + // artificial viscosity (Monaghan 1992) + numtyp fvisc = (numtyp)0; + if (delVdotDelR < (numyp)0) { + numtyp mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); + fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] } - power_d = MAX((numtyp)0.01,power_d); - numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] - wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); - numtyp wr = ucl_pow((numtyp)wc, (numtyp)0.5*power_d); - - numtyp kboltz = (numtyp)1.0; - numtyp GammaIJ = coeffy; // gamma[itype][jtype] - numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); - SigmaIJ = ucl_sqrt(SigmaIJ); - - numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] - force -= GammaIJ *wr*wr *dot*rinv; - force += SigmaIJ* wr *randnum * dtinvsqrt; - #ifndef ONETYPE - force *= factor_dpd*rinv; - #else - force *= rinv; - #endif + // total pair force & thermal energy increment + numtyp force = -massi * massj * (fi + fj + fvisc) * wfd; + numtyp deltaE = (numtyp)-0.5 * force * delVdotDelR; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; - // heat transfer - - if (r < coeff2w) { - numtyp wrT = (numtyp)1.0 - r/coeff2w; - wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); - wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] - numtyp randnumT = (numtyp)0; - saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); - randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); - - numtyp kappaT = coeff2y; // kappa[itype][jtype] - if (kappa_flag) { - numtyp factor = (numtyp)1.0; - factor += kcx*T_pow.x + kcy*T_pow.y + kcz*T_pow.z + kcw*T_pow.w; - kappaT *= factor; - } - - numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; - numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); - - numtyp dQc = kij * wrT*wrT * (Tj - Ti )/(Ti*Tj); - numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; - dQd /= (cvi+cvj); - numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; - Qi += (dQc + dQd + dQr ); - } - if (EVFLAG && eflag) { - numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; - #ifndef ONETYPE - energy+=factor_dpd*e; - #else + numtyp e = (numtyp)0; energy+=e; - #endif } if (EVFLAG && vflag) { virial[0] += delx*delx*force; @@ -401,6 +418,6 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, } // if ii store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); - store_drhoE(Qi,ii,inum,tid,t_per_atom,offset,Q); + store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE); }