/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov Peng Wang (Nvidia), penwang@nvidia.com Paul Crozier (SNL), pscrozi@sandia.gov ------------------------------------------------------------------------- */ #ifndef GB_GPU_KERNEL #define GB_GPU_KERNEL #include "gb_gpu_extra.h" template static __inline__ __device__ void compute_eta_torque(numtyp m[9], numtyp m2[9], const int i, numtyp ans[9]) { numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; den = (numtyp)1.0/den; numtyp shapex=_shape_(i,0); numtyp shapey=_shape_(i,1); numtyp shapez=_shape_(i,2); ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; } #include "gb_gpu_kernel.h" template __global__ void kernel_gayberne(const vec4* x_, const vec4 *q, const numtyp *gum, const numtyp *special_lj, const int *dev_nbor, const size_t nbor_pitch, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x); __syncthreads(); if (ii(itype,jtype); numtyp epsilon = _epsilon_(itype,jtype); numtyp varrho = sigma/(h12+gum[0]*sigma); numtyp varrho6 = varrho*varrho*varrho; varrho6*=varrho6; numtyp varrho12 = varrho6*varrho6; u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; temp1 = temp1*(numtyp)24.0*epsilon; uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; numtyp temp2 = gpu_dot3(kappa,r12); uslj_rsq = uslj_rsq*ir*ir; dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); } // torque for particle 1 { numtyp tempv[3], tempv2[3]; tempv[0] = -uslj_rsq*kappa[0]; tempv[1] = -uslj_rsq*kappa[1]; tempv[2] = -uslj_rsq*kappa[2]; gpu_row_times3(kappa,g1,tempv2); gpu_cross3(tempv,tempv2,tUr); } } } // Compute eta { eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); numtyp det_g12 = gpu_det3(g12); eta = pow(eta/det_g12,gum[1]); } // Compute teta numtyp temp[9], tempv[3], tempv2[3]; compute_eta_torque(g12,a1,itype,temp); numtyp temp1 = -eta*gum[1]; tempv[0] = temp1*temp[0]; tempv[1] = temp1*temp[1]; tempv[2] = temp1*temp[2]; gpu_cross3(a1,tempv,tempv2); teta[0] = tempv2[0]; teta[1] = tempv2[1]; teta[2] = tempv2[2]; tempv[0] = temp1*temp[3]; tempv[1] = temp1*temp[4]; tempv[2] = temp1*temp[5]; gpu_cross3(a1+3,tempv,tempv2); teta[0] += tempv2[0]; teta[1] += tempv2[1]; teta[2] += tempv2[2]; tempv[0] = temp1*temp[6]; tempv[1] = temp1*temp[7]; tempv[2] = temp1*temp[8]; gpu_cross3(a1+6,tempv,tempv2); teta[0] += tempv2[0]; teta[1] += tempv2[1]; teta[2] += tempv2[2]; } numtyp chi, dchi[3], tchi[3]; { // Compute chi and dchi // Compute b12 numtyp b2[9], b12[9]; { gpu_well_times3(jtype,a2,b12); gpu_transpose_times3(a2,b12,b2); gpu_plus3(b1,b2,b12); } // compute chi_12 r12[0]*=r; r12[1]*=r; r12[2]*=r; numtyp iota[3]; gpu_mldivide3(b12,r12,iota,err_flag); // -- iota is now iota/r iota[0]*=ir; iota[1]*=ir; iota[2]*=ir; r12[0]*=ir; r12[1]*=ir; r12[2]*=ir; chi = gpu_dot3(r12,iota); chi = pow(chi*(numtyp)2.0,gum[2]); // -- iota is now ok iota[0]*=r; iota[1]*=r; iota[2]*=r; numtyp temp1 = gpu_dot3(iota,r12); numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); dchi[0] = temp2*(iota[0]-temp1*r12[0]); dchi[1] = temp2*(iota[1]-temp1*r12[1]); dchi[2] = temp2*(iota[2]-temp1*r12[2]); // compute t_chi numtyp tempv[3]; gpu_row_times3(iota,b1,tempv); gpu_cross3(tempv,iota,tchi); temp1 = (numtyp)-4.0*ir*ir; tchi[0] *= temp1; tchi[1] *= temp1; tchi[2] *= temp1; } numtyp temp2 = factor_lj*eta*chi; if (eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; if (vflag) { r12[0]*=-r; r12[1]*=-r; r12[2]*=-r; numtyp ft=temp1*dchi[0]-temp2*dUr[0]; fx+=ft; virial[0]+=r12[0]*ft; ft=temp1*dchi[1]-temp2*dUr[1]; fy+=ft; virial[1]+=r12[1]*ft; virial[3]+=r12[0]*ft; ft=temp1*dchi[2]-temp2*dUr[2]; fz+=ft; virial[2]+=r12[2]*ft; virial[4]+=r12[0]*ft; virial[5]+=r12[1]*ft; } else { fx+=temp1*dchi[0]-temp2*dUr[0]; fy+=temp1*dchi[1]-temp2*dUr[1]; fz+=temp1*dchi[2]-temp2*dUr[2]; } // Torque on 1 temp1 = -u_r*eta*factor_lj; temp2 = -u_r*chi*factor_lj; numtyp temp3 = -chi*eta*factor_lj; torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; } // for nbor // Store answers acctyp *ap1=ans+ii*ans_pitch; if (eflag) { *ap1=energy; ap1++; } if (vflag) { for (int i=0; i<6; i++) { *ap1=virial[i]; ap1++; } } *ap1=fx; ap1++; *ap1=fy; ap1++; *ap1=fz; ap1++; *ap1=torx; ap1++; *ap1=tory; ap1++; *ap1=torz; } // if ii } template __global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q, const numtyp *gum, const numtyp *special_lj, const int *dev_nbor, const size_t nbor_pitch, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x)+start; __syncthreads(); if (ii(itype,0); numtyp one_well=_well_(itype,0); numtyp factor_lj; for ( ; nbor(itype,jtype); numtyp epsilon = _epsilon_(itype,jtype); numtyp varrho = sigma/(h12+gum[0]*sigma); numtyp varrho6 = varrho*varrho*varrho; varrho6*=varrho6; numtyp varrho12 = varrho6*varrho6; u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; temp1 = temp1*(numtyp)24.0*epsilon; uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; numtyp temp2 = gpu_dot3(kappa,r12hat); uslj_rsq = uslj_rsq*ir*ir; dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); } } } // Compute eta { eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); numtyp det_g12 = gpu_det3(g12); eta = pow(eta/det_g12,gum[1]); } } numtyp chi, dchi[3]; { // Compute chi and dchi // Compute b12 numtyp b12[9]; { numtyp b2[9]; gpu_well_times3(jtype,a2,b12); gpu_transpose_times3(a2,b12,b2); b12[0]=b2[0]+one_well; b12[4]=b2[4]+one_well; b12[8]=b2[8]+one_well; b12[1]=b2[1]; b12[2]=b2[2]; b12[3]=b2[3]; b12[5]=b2[5]; b12[6]=b2[6]; b12[7]=b2[7]; } // compute chi_12 numtyp iota[3]; gpu_mldivide3(b12,r12,iota,err_flag); // -- iota is now iota/r iota[0]*=ir; iota[1]*=ir; iota[2]*=ir; chi = gpu_dot3(r12hat,iota); chi = pow(chi*(numtyp)2.0,gum[2]); // -- iota is now ok iota[0]*=r; iota[1]*=r; iota[2]*=r; numtyp temp1 = gpu_dot3(iota,r12hat); numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); } numtyp temp2 = factor_lj*eta*chi; if (eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; if (vflag) { r12[0]*=-1; r12[1]*=-1; r12[2]*=-1; numtyp ft=temp1*dchi[0]-temp2*dUr[0]; fx+=ft; virial[0]+=r12[0]*ft; ft=temp1*dchi[1]-temp2*dUr[1]; fy+=ft; virial[1]+=r12[1]*ft; virial[3]+=r12[0]*ft; ft=temp1*dchi[2]-temp2*dUr[2]; fz+=ft; virial[2]+=r12[2]*ft; virial[4]+=r12[0]*ft; virial[5]+=r12[1]*ft; } else { fx+=temp1*dchi[0]-temp2*dUr[0]; fy+=temp1*dchi[1]-temp2*dUr[1]; fz+=temp1*dchi[2]-temp2*dUr[2]; } } // for nbor // Store answers acctyp *ap1=ans+ii*ans_pitch; if (eflag) { *ap1=energy; ap1++; } if (vflag) { for (int i=0; i<6; i++) { *ap1=virial[i]; ap1++; } } *ap1=fx; ap1++; *ap1=fy; ap1++; *ap1=fz; } // if ii } template __global__ void kernel_lj(const vec4 *x_, const numtyp *special_lj, const int *dev_nbor, const size_t nbor_pitch, const int *dev_ij, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x)+start; __syncthreads(); if (ii(itype,jtype) && _form_(itype,jtype)==SPHERE_SPHERE) { r2inv=(numtyp)1.0/r2inv; numtyp r6inv = r2inv*r2inv*r2inv; numtyp force = r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- _lj1_(itype,jtype).y); force*=factor_lj; fx+=delx*force; fy+=dely*force; fz+=delz*force; if (eflag) { numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- _lj3_(itype,jtype).y); energy+=factor_lj*(e-_offset_(1,1)); } if (vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor // Store answers acctyp *ap1=ans+ii*ans_pitch; if (eflag) { *ap1+=energy; ap1++; } if (vflag) { for (int i=0; i<6; i++) { *ap1+=virial[i]; ap1++; } } *ap1+=fx; ap1++; *ap1+=fy; ap1++; *ap1+=fz; } // if ii } template __global__ void kernel_lj_fast(const vec4 *x_, const numtyp *special_lj, const int *dev_nbor, const size_t nbor_pitch, const int *dev_ij, acctyp *ans, size_t ans_pitch,int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall){ // ii indexes the two interacting particles in gi int ii=threadIdx.x; __shared__ numtyp sp_lj[4]; __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (ii<4) sp_lj[ii]=special_lj[ii]; if (ii(itype,jtype); form[ii]=_form_(itype,jtype); lj1[ii]=_lj1_(itype,jtype).x; lj2[ii]=_lj1_(itype,jtype).y; if (eflag) { lj3[ii]=_lj3_(itype,jtype).x; lj4[ii]=_lj3_(itype,jtype).y; offset[ii]=_offset_(itype,jtype); } } ii+=INT_MUL(blockIdx.x,blockDim.x)+start; __syncthreads(); if (ii