diff --git a/lib/gpu/re_squared.cpp b/lib/gpu/re_squared.cpp index 83946dac98..a96454a1fa 100644 --- a/lib/gpu/re_squared.cpp +++ b/lib/gpu/re_squared.cpp @@ -96,12 +96,12 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions - gamma_upsilon_mu.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); host_write[0]=static_cast(host_special_lj[0]); host_write[1]=static_cast(host_special_lj[1]); host_write[2]=static_cast(host_special_lj[2]); host_write[3]=static_cast(host_special_lj[3]); - ucl_copy(gamma_upsilon_mu,host_write,4,false); + ucl_copy(special_lj,host_write,4,false); lshape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); UCL_H_Vec d_view; @@ -131,7 +131,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ - lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+ + lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+ lshape.row_bytes()+shape.row_bytes()+well.row_bytes(); return 0; @@ -159,7 +159,7 @@ void RESquaredT::clear() { shape.clear(); well.clear(); lshape.clear(); - gamma_upsilon_mu.clear(); + special_lj.clear(); this->clear_base(); } @@ -207,7 +207,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->k_ellipsoid.set_size(GX,BX); this->k_ellipsoid.run(&this->atom->dev_x.begin(), &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), - &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), + &this->special_lj.begin(), &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, &anall, @@ -238,7 +238,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(), &this->atom->dev_quat.begin(), &this->shape.begin(), - &this->well.begin(), &this->gamma_upsilon_mu.begin(), + &this->well.begin(), &this->special_lj.begin(), &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, @@ -262,7 +262,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { if (this->_shared_types) { this->k_lj_fast.set_size(GX,BX); this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(), - &this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride, + &this->lj3.begin(), &this->special_lj.begin(), &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, &ainum, &anall, @@ -270,7 +270,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { } else { this->k_lj.set_size(GX,BX); this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(), - &this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(), + &this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(), &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, &ainum, &anall, @@ -287,7 +287,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->k_ellipsoid.set_size(GX,BX); this->k_ellipsoid.run(&this->atom->dev_x.begin(), &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), - &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), + &this->special_lj.begin(), &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(), &ainum, &this->ans->dev_engv.begin(), &this->dev_error.begin(), diff --git a/lib/gpu/re_squared.cu b/lib/gpu/re_squared.cu index 9d8c49b7f0..62b242ea48 100644 --- a/lib/gpu/re_squared.cu +++ b/lib/gpu/re_squared.cu @@ -41,7 +41,7 @@ __inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, __global numtyp4* shape, __global numtyp4* well, - __global numtyp *gum, __global numtyp2* sig_eps, + __global numtyp *splj, __global numtyp2* sig_eps, const int ntypes, __global numtyp *lshape, __global int *dev_nbor, const int stride, __global acctyp4 *ans, const int astride, @@ -54,10 +54,10 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, int offset=tid%t_per_atom; __local numtyp sp_lj[4]; - sp_lj[0]=gum[0]; - sp_lj[1]=gum[1]; - sp_lj[2]=gum[2]; - sp_lj[3]=gum[3]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; __local numtyp b_alpha, cr60; b_alpha=(numtyp)45.0/(numtyp)56.0; @@ -242,8 +242,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur; Ur = epsilon*Ur*pow(sigh,(numtyp)6.0)/(numtyp)2025.0; - if (eflag>0) - energy+=Ua+Ur; + energy+=Ua+Ur; // force @@ -399,7 +398,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; - tor.x += -(dUa*Ua+dUr*Ur); + tor.x -= (dUa*Ua+dUr*Ur); } { @@ -425,7 +424,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; - tor.y += -(dUa*Ua+dUr*Ur); + tor.y -= (dUa*Ua+dUr*Ur); } { @@ -451,7 +450,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; - tor.z += -(dUa*Ua+dUr*Ur); + tor.z -= (dUa*Ua+dUr*Ur); } } // for nbor diff --git a/lib/gpu/re_squared.h b/lib/gpu/re_squared.h index 85fc738d07..aadb365dc3 100644 --- a/lib/gpu/re_squared.h +++ b/lib/gpu/re_squared.h @@ -70,8 +70,8 @@ class RESquared : public BaseEllipsoid { UCL_D_Vec lj3; /// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon UCL_D_Vec sigma_epsilon; - // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ... - UCL_D_Vec gamma_upsilon_mu; + /// special lj 0-4 + UCL_D_Vec special_lj; /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; diff --git a/lib/gpu/re_squared_lj.cu b/lib/gpu/re_squared_lj.cu index 910c1e4ae3..febd880ba5 100644 --- a/lib/gpu/re_squared_lj.cu +++ b/lib/gpu/re_squared_lj.cu @@ -24,9 +24,320 @@ #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } + +__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *splj, __global numtyp2* sig_eps, + const int ntypes, __global numtyp *lshape, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, const int astride, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int inum, + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; + + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; + b_alpha=(numtyp)45.0/(numtyp)56.0; + cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0); + solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0); + solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (iibody) + numtyp aTe[9]; // A'*E + numtyp lA_0[9], lA_1[9], lA_2[9]; // -A*rotation generator (x,y, or z) + numtyp4 ishape; + + ishape=shape[itype]; + + { + gpu_quat_to_mat_trans(q,i,a); + gpu_transpose_times_diag3(a,well[itype],aTe); + gpu_rotation_generator_x(a,lA_0); + gpu_rotation_generator_y(a,lA_1); + gpu_rotation_generator_z(a,lA_2); + } + + numtyp factor_lj; + for ( ; nbor0) + virial[0]+=-r[0]*force; + } else if (i==1) { + f.y+=force; + if (vflag>0) { + virial[1]+=-r[1]*force; + virial[3]+=-r[0]*force; + } + } else { + f.z+=force; + if (vflag>0) { + virial[2]+=-r[2]*force; + virial[4]+=-r[0]*force; + virial[5]+=-r[1]*force; + } + } + + } + + // torque on i + numtyp fwae[3]; + gpu_row_times3(fourw,aTe,fwae); + { + numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_0,rhat,p); + gpu_times_column3(lA_0,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_0,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.x -= (dUa*Ua+dUr*Ur); + } + + { + numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_1,rhat,p); + gpu_times_column3(lA_1,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_1,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.y -= (dUa*Ua+dUr*Ur); + } + + { + numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_2,rhat,p); + gpu_times_column3(lA_2,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_2,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.z -= (dUa*Ua+dUr*Ur); + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; + + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + ap1+=astride; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=astride; + } + } + ans[ii]=f; + ans[ii+astride]=tor; + } // if ii + +} + __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, __global numtyp4* shape,__global numtyp4* well, - __global numtyp *gum, __global numtyp2* sig_eps, + __global numtyp *splj, __global numtyp2* sig_eps, const int ntypes, __global numtyp *lshape, __global int *dev_nbor, const int stride, __global acctyp4 *ans, __global acctyp *engv, @@ -39,10 +350,16 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, int offset=tid%t_per_atom; __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; + + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; + b_alpha=(numtyp)45.0/(numtyp)56.0; + cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0); + solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0); + solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0); acctyp energy=(acctyp)0; acctyp4 f; @@ -55,208 +372,175 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, if (iibody) + numtyp aTe[9]; // A'*E + numtyp4 ishape; + + ishape=shape[itype]; + gpu_quat_to_mat_trans(q,i,a); + gpu_transpose_times_diag3(a,well[itype],aTe); // Compute r12 - numtyp r12[3]; - r12[0] = jx.x-ix.x; - r12[1] = jx.y-ix.y; - r12[2] = jx.z-ix.z; - numtyp ir = gpu_dot3(r12,r12); + numtyp r[3], rhat[3]; + numtyp rnorm; + r[0] = jx.x-ix.x; + r[1] = jx.y-ix.y; + r[2] = jx.z-ix.z; + rnorm = gpu_dot3(r,r); + rnorm = rsqrt(rnorm); + rhat[0] = r[0]*rnorm; + rhat[1] = r[1]*rnorm; + rhat[2] = r[2]*rnorm; - ir = rsqrt(ir); - numtyp r = (numtyp)1.0/ir; + numtyp sigma, epsilon; + int mtype=mul24(ntypes,itype)+jtype; + sigma = sig_eps[mtype].x; + epsilon = sig_eps[mtype].y; + + numtyp aTs[9]; + numtyp4 scorrect; + numtyp half_sigma=sigma / (numtyp)2.0; + scorrect.x = ishape.x+half_sigma; + scorrect.y = ishape.y+half_sigma; + scorrect.z = ishape.z+half_sigma; + scorrect.x = scorrect.x * scorrect.x / (numtyp)2.0; + scorrect.y = scorrect.y * scorrect.y / (numtyp)2.0; + scorrect.z = scorrect.z * scorrect.z / (numtyp)2.0; + gpu_transpose_times_diag3(a,scorrect,aTs); - numtyp r12hat[3]; - r12hat[0]=r12[0]*ir; - r12hat[1]=r12[1]*ir; - r12hat[2]=r12[2]*ir; + // energy - numtyp a2[9]; - gpu_quat_to_mat_trans(q,j,a2); - - numtyp u_r, dUr[3], eta; - { // Compute U_r, dUr, eta, and teta - // Compute g12 - numtyp g12[9]; - { - { - numtyp g2[9]; - gpu_diag_times3(shape[jtype],a2,g12); - gpu_transpose_times3(a2,g12,g2); - g12[0]=g2[0]+oner; - g12[4]=g2[4]+oner; - g12[8]=g2[8]+oner; - g12[1]=g2[1]; - g12[2]=g2[2]; - g12[3]=g2[3]; - g12[5]=g2[5]; - g12[6]=g2[6]; - g12[7]=g2[7]; - } - - { // Compute U_r and dUr + numtyp gamma[9], s[3]; + gpu_times3(aTs,a,gamma); + gpu_mldivide3(gamma,rhat,s,err_flag); + + numtyp sigma12 = (numtyp)1.0/sqrt((numtyp)0.5*gpu_dot3(s,rhat)); + numtyp temp[9], w[3]; + gpu_times3(aTe,a,temp); + temp[0] += (numtyp)1.0; + temp[4] += (numtyp)1.0; + temp[8] += (numtyp)1.0; + gpu_mldivide3(temp,rhat,w,err_flag); + + numtyp h12 = rnorm-sigma12; + numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w); + numtyp sigh = sigma/h12; + numtyp tprod = chi*sigh; + + numtyp Ua, Ur; + numtyp h12p3 = pow(h12,(numtyp)3.0); + numtyp sigmap3 = pow(sigma,(numtyp)3.0); + numtyp stemp = h12/(numtyp)2.0; + Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0; + Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*lshape[itype]/Ua; + Ua = epsilon*Ua*sigmap3*solv_f_a; - // Compute kappa - numtyp kappa[3]; - gpu_mldivide3(g12,r12,kappa,err_flag); + stemp = h12/cr60; + Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ + (numtyp)60.0; + Ur = ((numtyp)1.0+b_alpha*tprod)*lshape[itype]/Ur; + Ur = epsilon*Ur*sigmap3*pow(sigh,(numtyp)6.0)*solv_f_r; - // -- kappa is now / r - kappa[0]*=ir; - kappa[1]*=ir; - kappa[2]*=ir; + energy+=Ua+Ur; + + // force + + numtyp fourw[3], spr[3]; + numtyp sec = sigma*chi; + numtyp sigma12p3 = pow(sigma12,(numtyp)3.0); + fourw[0] = (numtyp)4.0*w[0]; + fourw[1] = (numtyp)4.0*w[1]; + fourw[2] = (numtyp)4.0*w[2]; + spr[0] = (numtyp)0.5*sigma12p3*s[0]; + spr[1] = (numtyp)0.5*sigma12p3*s[1]; + spr[2] = (numtyp)0.5*sigma12p3*s[2]; + + stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+ + (numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+ + (numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+ + (numtyp)3.0/h12; + numtyp hsec = h12+(numtyp)3.0*sec; + numtyp dspu = (numtyp)1.0/h12-(numtyp)1.0/hsec+stemp; + numtyp pbsu = (numtyp)3.0*sigma/hsec; - // energy + stemp = (numtyp)1.0/(ishape.x*cr60+h12)+ + (numtyp)1.0/(ishape.y*cr60+h12)+ + (numtyp)1.0/(ishape.z*cr60+h12)+ + (numtyp)3.0/h12; + hsec = h12+b_alpha*sec; + numtyp dspr = (numtyp)7.0/h12-(numtyp)1.0/hsec+stemp; + numtyp pbsr = b_alpha*sigma/hsec; - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12hat,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; - - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - int mtype=mul24(ntypes,itype)+jtype; - numtyp sigma = sig_eps[mtype].x; - numtyp epsilon = sig_eps[mtype].y; - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12hat); - uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); - dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); - dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); - } + #pragma unroll + for (int i=0; i<3; i++) { + numtyp u[3]; + u[0] = -rhat[i]*rhat[0]; + u[1] = -rhat[i]*rhat[1]; + u[2] = -rhat[i]*rhat[2]; + u[i] += (numtyp)1.0; + u[0] /= rnorm; + u[1] /= rnorm; + u[2] /= rnorm; + numtyp dchi = gpu_dot3(u,fourw); + numtyp dh12 = rhat[i]+gpu_dot3(u,spr); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + numtyp force=dUr*Ur+dUa*Ua; + if (i==0) { + f.x+=force; + if (vflag>0) + virial[0]+=-r[0]*force; + } else if (i==1) { + f.y+=force; + if (vflag>0) { + virial[1]+=-r[1]*force; + virial[3]+=-r[0]*force; + } + } else { + f.z+=force; + if (vflag>0) { + virial[2]+=-r[2]*force; + virial[4]+=-r[0]*force; + virial[5]+=-r[1]*force; } } - - // Compute eta - { - eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - } - - numtyp chi, dchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b12[9]; - { - numtyp b2[9]; - gpu_diag_times3(well[jtype],a2,b12); - gpu_transpose_times3(a2,b12,b2); - b12[0]=b2[0]+one_well; - b12[4]=b2[4]+one_well; - b12[8]=b2[8]+one_well; - b12[1]=b2[1]; - b12[2]=b2[2]; - b12[3]=b2[3]; - b12[5]=b2[5]; - b12[6]=b2[6]; - b12[7]=b2[7]; - } - - // compute chi_12 - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - chi = gpu_dot3(r12hat,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12hat); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/ - gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); - dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); - dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); - } - - numtyp temp2 = factor_lj*eta*chi; - if (eflag>0) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { - r12[0]*=-1; - r12[1]*=-1; - r12[2]*=-1; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - f.x+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - f.y+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - f.z+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - f.x+=temp1*dchi[0]-temp2*dUr[0]; - f.y+=temp1*dchi[1]-temp2*dUr[1]; - f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + } // for nbor } // if ii // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_PAIR]; + __local acctyp red_acc[7][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; red_acc[2][tid]=f.z; - red_acc[3][tid]=energy; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { if (offset < s) { - for (int r=0; r<4; r++) + for (int r=0; r<3; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } @@ -264,21 +548,22 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, f.x=red_acc[0][tid]; f.y=red_acc[1][tid]; f.z=red_acc[2][tid]; - energy=red_acc[3][tid]; - if (vflag>0) { + if (eflag>0 || vflag>0) { for (int r=0; r<6; r++) red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { if (offset < s) { - for (int r=0; r<6; r++) + for (int r=0; r<7; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } for (int r=0; r<6; r++) virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; } }