/*************************************************************************** gb_gpu_kernel.cu ------------------- W. Michael Brown Routines that actually perform the force/torque computation *** Force Decomposition by Atom Version *** __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Jun 23 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. ----------------------------------------------------------------------- */ #ifndef GB_GPU_KERNEL #define GB_GPU_KERNEL #include "gb_gpu_extra.h" template static __inline__ __device__ void compute_eta_torque(numtyp m[9], numtyp m2[9], const int i, numtyp ans[9]) { numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; den = (numtyp)1.0/den; numtyp shapex=_shape_(i,0); numtyp shapey=_shape_(i,1); numtyp shapez=_shape_(i,2); ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; } template __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj, const int *dev_nbor, const int nbor_pitch, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x); __syncthreads(); if (ii(i,0); numtyp iy=_x_(i,1); numtyp iz=_x_(i,2); int itype=_x_(i,7); numtyp a1[9], b1[9], g1[9]; { numtyp t[9]; gpu_quat_to_mat_trans(i,a1); gpu_shape_times3(itype,a1,t); gpu_transpose_times3(a1,t,g1); gpu_well_times3(itype,a1,t); gpu_transpose_times3(a1,t,b1); } numtyp factor_lj; for ( ; nbor(j,7); // Compute r12 numtyp r12[3]; r12[0] = _x_(j,0)-ix; r12[1] = _x_(j,1)-iy; r12[2] = _x_(j,2)-iz; numtyp ir = gpu_dot3(r12,r12); ir = rsqrt(ir); numtyp r = (numtyp)1.0/ir; numtyp a2[9]; gpu_quat_to_mat_trans(j,a2); numtyp u_r, dUr[3], tUr[3], eta, teta[3]; { // Compute U_r, dUr, eta, and teta // Compute g12 numtyp g12[9]; { numtyp g2[9]; { gpu_shape_times3(jtype,a2,g12); gpu_transpose_times3(a2,g12,g2); gpu_plus3(g1,g2,g12); } { // Compute U_r and dUr // Compute kappa numtyp kappa[3]; gpu_mldivide3(g12,r12,kappa,err_flag); // -- replace r12 with r12 hat r12[0]*=ir; r12[1]*=ir; r12[2]*=ir; // -- kappa is now / r kappa[0]*=ir; kappa[1]*=ir; kappa[2]*=ir; // energy // compute u_r and dUr numtyp uslj_rsq; { // Compute distance of closest approach numtyp h12, sigma12; sigma12 = gpu_dot3(r12,kappa); sigma12 = rsqrt((numtyp)0.5*sigma12); h12 = r-sigma12; // -- kappa is now ok kappa[0]*=r; kappa[1]*=r; kappa[2]*=r; numtyp sigma = _sigma_(itype,jtype); numtyp epsilon = _epsilon_(itype,jtype); numtyp varrho = sigma/(h12+gum[0]*sigma); numtyp varrho6 = varrho*varrho*varrho; varrho6*=varrho6; numtyp varrho12 = varrho6*varrho6; u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; temp1 = temp1*(numtyp)24.0*epsilon; uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; numtyp temp2 = gpu_dot3(kappa,r12); uslj_rsq = uslj_rsq*ir*ir; dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); } // torque for particle 1 { numtyp tempv[3], tempv2[3]; tempv[0] = -uslj_rsq*kappa[0]; tempv[1] = -uslj_rsq*kappa[1]; tempv[2] = -uslj_rsq*kappa[2]; gpu_row_times3(kappa,g1,tempv2); gpu_cross3(tempv,tempv2,tUr); } } } // Compute eta { eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); numtyp det_g12 = gpu_det3(g12); eta = pow(eta/det_g12,gum[1]); } // Compute teta numtyp temp[9], tempv[3], tempv2[3]; compute_eta_torque(g12,a1,itype,temp); numtyp temp1 = -eta*gum[1]; tempv[0] = temp1*temp[0]; tempv[1] = temp1*temp[1]; tempv[2] = temp1*temp[2]; gpu_cross3(a1,tempv,tempv2); teta[0] = tempv2[0]; teta[1] = tempv2[1]; teta[2] = tempv2[2]; tempv[0] = temp1*temp[3]; tempv[1] = temp1*temp[4]; tempv[2] = temp1*temp[5]; gpu_cross3(a1+3,tempv,tempv2); teta[0] += tempv2[0]; teta[1] += tempv2[1]; teta[2] += tempv2[2]; tempv[0] = temp1*temp[6]; tempv[1] = temp1*temp[7]; tempv[2] = temp1*temp[8]; gpu_cross3(a1+6,tempv,tempv2); teta[0] += tempv2[0]; teta[1] += tempv2[1]; teta[2] += tempv2[2]; } numtyp chi, dchi[3], tchi[3]; { // Compute chi and dchi // Compute b12 numtyp b2[9], b12[9]; { gpu_well_times3(jtype,a2,b12); gpu_transpose_times3(a2,b12,b2); gpu_plus3(b1,b2,b12); } // compute chi_12 r12[0]*=r; r12[1]*=r; r12[2]*=r; numtyp iota[3]; gpu_mldivide3(b12,r12,iota,err_flag); // -- iota is now iota/r iota[0]*=ir; iota[1]*=ir; iota[2]*=ir; r12[0]*=ir; r12[1]*=ir; r12[2]*=ir; chi = gpu_dot3(r12,iota); chi = pow(chi*(numtyp)2.0,gum[2]); // -- iota is now ok iota[0]*=r; iota[1]*=r; iota[2]*=r; numtyp temp1 = gpu_dot3(iota,r12); numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); dchi[0] = temp2*(iota[0]-temp1*r12[0]); dchi[1] = temp2*(iota[1]-temp1*r12[1]); dchi[2] = temp2*(iota[2]-temp1*r12[2]); // compute t_chi numtyp tempv[3]; gpu_row_times3(iota,b1,tempv); gpu_cross3(tempv,iota,tchi); temp1 = (numtyp)-4.0*ir*ir; tchi[0] *= temp1; tchi[1] *= temp1; tchi[2] *= temp1; } numtyp temp2 = factor_lj*eta*chi; if (eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; if (vflag) { r12[0]*=-r; r12[1]*=-r; r12[2]*=-r; numtyp ft=temp1*dchi[0]-temp2*dUr[0]; fx+=ft; virial[0]+=r12[0]*ft; ft=temp1*dchi[1]-temp2*dUr[1]; fy+=ft; virial[1]+=r12[1]*ft; virial[3]+=r12[0]*ft; ft=temp1*dchi[2]-temp2*dUr[2]; fz+=ft; virial[2]+=r12[2]*ft; virial[4]+=r12[0]*ft; virial[5]+=r12[1]*ft; } else { fx+=temp1*dchi[0]-temp2*dUr[0]; fy+=temp1*dchi[1]-temp2*dUr[1]; fz+=temp1*dchi[2]-temp2*dUr[2]; } // Torque on 1 temp1 = -u_r*eta*factor_lj; temp2 = -u_r*chi*factor_lj; numtyp temp3 = -chi*eta*factor_lj; torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; } // for nbor // Store answers acctyp *ap1=ans+ii; if (eflag) { *ap1=energy; ap1+=ans_pitch; } if (vflag) { for (int i=0; i<6; i++) { *ap1=virial[i]; ap1+=ans_pitch; } } *ap1=fx; ap1+=ans_pitch; *ap1=fy; ap1+=ans_pitch; *ap1=fz; ap1+=ans_pitch; *ap1=torx; ap1+=ans_pitch; *ap1=tory; ap1+=ans_pitch; *ap1=torz; } // if ii } template __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj, const int *dev_nbor, const int nbor_pitch, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x)+start; __syncthreads(); if (ii(i,0); numtyp iy=_x_(i,1); numtyp iz=_x_(i,2); int itype=_x_(i,7); numtyp oner=_shape_(itype,0); numtyp one_well=_well_(itype,0); numtyp factor_lj; for ( ; nbor(j,7); // Compute r12 numtyp r12[3]; r12[0] = _x_(j,0)-ix; r12[1] = _x_(j,1)-iy; r12[2] = _x_(j,2)-iz; numtyp ir = gpu_dot3(r12,r12); ir = rsqrt(ir); numtyp r = (numtyp)1.0/ir; numtyp r12hat[3]; r12hat[0]=r12[0]*ir; r12hat[1]=r12[1]*ir; r12hat[2]=r12[2]*ir; numtyp a2[9]; gpu_quat_to_mat_trans(j,a2); numtyp u_r, dUr[3], eta; { // Compute U_r, dUr, eta, and teta // Compute g12 numtyp g12[9]; { { numtyp g2[9]; gpu_shape_times3(jtype,a2,g12); gpu_transpose_times3(a2,g12,g2); g12[0]=g2[0]+oner; g12[4]=g2[4]+oner; g12[8]=g2[8]+oner; g12[1]=g2[1]; g12[2]=g2[2]; g12[3]=g2[3]; g12[5]=g2[5]; g12[6]=g2[6]; g12[7]=g2[7]; } { // Compute U_r and dUr // Compute kappa numtyp kappa[3]; gpu_mldivide3(g12,r12,kappa,err_flag); // -- kappa is now / r kappa[0]*=ir; kappa[1]*=ir; kappa[2]*=ir; // energy // compute u_r and dUr numtyp uslj_rsq; { // Compute distance of closest approach numtyp h12, sigma12; sigma12 = gpu_dot3(r12hat,kappa); sigma12 = rsqrt((numtyp)0.5*sigma12); h12 = r-sigma12; // -- kappa is now ok kappa[0]*=r; kappa[1]*=r; kappa[2]*=r; numtyp sigma = _sigma_(itype,jtype); numtyp epsilon = _epsilon_(itype,jtype); numtyp varrho = sigma/(h12+gum[0]*sigma); numtyp varrho6 = varrho*varrho*varrho; varrho6*=varrho6; numtyp varrho12 = varrho6*varrho6; u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; temp1 = temp1*(numtyp)24.0*epsilon; uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; numtyp temp2 = gpu_dot3(kappa,r12hat); uslj_rsq = uslj_rsq*ir*ir; dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); } } } // Compute eta { eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); numtyp det_g12 = gpu_det3(g12); eta = pow(eta/det_g12,gum[1]); } } numtyp chi, dchi[3]; { // Compute chi and dchi // Compute b12 numtyp b12[9]; { numtyp b2[9]; gpu_well_times3(jtype,a2,b12); gpu_transpose_times3(a2,b12,b2); b12[0]=b2[0]+one_well; b12[4]=b2[4]+one_well; b12[8]=b2[8]+one_well; b12[1]=b2[1]; b12[2]=b2[2]; b12[3]=b2[3]; b12[5]=b2[5]; b12[6]=b2[6]; b12[7]=b2[7]; } // compute chi_12 numtyp iota[3]; gpu_mldivide3(b12,r12,iota,err_flag); // -- iota is now iota/r iota[0]*=ir; iota[1]*=ir; iota[2]*=ir; chi = gpu_dot3(r12hat,iota); chi = pow(chi*(numtyp)2.0,gum[2]); // -- iota is now ok iota[0]*=r; iota[1]*=r; iota[2]*=r; numtyp temp1 = gpu_dot3(iota,r12hat); numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); } numtyp temp2 = factor_lj*eta*chi; if (eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; if (vflag) { r12[0]*=-1; r12[1]*=-1; r12[2]*=-1; numtyp ft=temp1*dchi[0]-temp2*dUr[0]; fx+=ft; virial[0]+=r12[0]*ft; ft=temp1*dchi[1]-temp2*dUr[1]; fy+=ft; virial[1]+=r12[1]*ft; virial[3]+=r12[0]*ft; ft=temp1*dchi[2]-temp2*dUr[2]; fz+=ft; virial[2]+=r12[2]*ft; virial[4]+=r12[0]*ft; virial[5]+=r12[1]*ft; } else { fx+=temp1*dchi[0]-temp2*dUr[0]; fy+=temp1*dchi[1]-temp2*dUr[1]; fz+=temp1*dchi[2]-temp2*dUr[2]; } } // for nbor // Store answers acctyp *ap1=ans+ii; if (eflag) { *ap1=energy; ap1+=ans_pitch; } if (vflag) { for (int i=0; i<6; i++) { *ap1=virial[i]; ap1+=ans_pitch; } } *ap1=fx; ap1+=ans_pitch; *ap1=fy; ap1+=ans_pitch; *ap1=fz; } // if ii } template __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, const int *dev_ij, const int nbor_pitch, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x)+start; __syncthreads(); if (ii(i,0); numtyp iy=_x_(i,1); numtyp iz=_x_(i,2); int itype=_x_(i,7); numtyp factor_lj; for ( ; list(j,7); // Compute r12 numtyp delx = ix-_x_(j,0); numtyp dely = iy-_x_(j,1); numtyp delz = iz-_x_(j,2); numtyp r2inv = delx*delx+dely*dely+delz*delz; if (r2inv<_cutsq_(itype,jtype) && _form_(itype,jtype)==SPHERE_SPHERE) { r2inv=(numtyp)1.0/r2inv; numtyp r6inv = r2inv*r2inv*r2inv; numtyp force = r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- _lj1_(itype,jtype).y); force*=factor_lj; fx+=delx*force; fy+=dely*force; fz+=delz*force; if (eflag) { numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- _lj3_(itype,jtype).y); energy+=factor_lj*(e-_offset_(1,1)); } if (vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor // Store answers acctyp *ap1=ans+ii; if (eflag) { *ap1+=energy; ap1+=ans_pitch; } if (vflag) { for (int i=0; i<6; i++) { *ap1+=virial[i]; ap1+=ans_pitch; } } *ap1+=fx; ap1+=ans_pitch; *ap1+=fy; ap1+=ans_pitch; *ap1+=fz; } // if ii } template __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, const int *dev_ij, const int nbor_pitch, acctyp *ans, size_t ans_pitch,int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall){ // ii indexes the two interacting particles in gi int ii=threadIdx.x; __shared__ numtyp sp_lj[4]; __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (ii<4) sp_lj[ii]=special_lj[ii]; if (ii(itype,jtype); form[ii]=_form_(itype,jtype); lj1[ii]=_lj1_(itype,jtype).x; lj2[ii]=_lj1_(itype,jtype).y; if (eflag) { lj3[ii]=_lj3_(itype,jtype).x; lj4[ii]=_lj3_(itype,jtype).y; offset[ii]=_offset_(itype,jtype); } } ii+=INT_MUL(blockIdx.x,blockDim.x)+start; __syncthreads(); if (ii(i,0); numtyp iy=_x_(i,1); numtyp iz=_x_(i,2); int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,7)); numtyp factor_lj; for ( ; list(j,7); // Compute r12 numtyp delx = ix-_x_(j,0); numtyp dely = iy-_x_(j,1); numtyp delz = iz-_x_(j,2); numtyp r2inv = delx*delx+dely*dely+delz*delz; if (r2inv