// ************************************************************************** // ellipsoid_extra.h // ------------------- // W. Michael Brown (ORNL) // // Device code for Ellipsoid math routines // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov // ***************************************************************************/ #ifndef LAL_ELLIPSOID_EXTRA_H #define LAL_ELLIPSOID_EXTRA_H enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE _texture( pos_tex, float4); _texture( quat_tex,float4); #else _texture_2d( pos_tex,int4); _texture_2d( quat_tex,int4); #endif #else #define pos_tex x_ #define quat_tex qif #endif #define nbor_info_e_ss(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ i, numj, stride, nbor_end, nbor_begin) \ i=nbor_mem[ii]; \ nbor_begin=ii+nbor_stride; \ numj=nbor_mem[nbor_begin]; \ nbor_begin+=nbor_stride; \ nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \ nbor_begin+=fast_mul(offset,nbor_stride); \ stride=fast_mul(t_per_atom,nbor_stride); #if (SHUFFLE_AVAIL == 0) #define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ red_acc[3][tid]=tor.x; \ red_acc[4][tid]=tor.y; \ red_acc[5][tid]=tor.z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ } \ } \ f.x=red_acc[0][tid]; \ f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ tor.x=red_acc[3][tid]; \ tor.y=red_acc[4][tid]; \ tor.z=red_acc[5][tid]; \ if (EVFLAG && (eflag || vflag)) { \ if (vflag) { \ simdsync(); \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ } \ } \ for (int r=0; r<6; r++) \ virial[r]=red_acc[r][tid]; \ } \ if (eflag) { \ simdsync(); \ red_acc[0][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) red_acc[0][tid] += red_acc[0][tid+s]; \ } \ } \ energy=red_acc[0][tid]; \ } \ } \ if (offset==0 && ii1) { \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ red_acc[3][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) { \ for (int r=0; r<4; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ } \ } \ f.x=red_acc[0][tid]; \ f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ energy=red_acc[3][tid]; \ if (EVFLAG && vflag) { \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ } \ } \ for (int r=0; r<6; r++) \ virial[r]=red_acc[r][tid]; \ } \ } \ if (offset==0 && ii1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ f.x += shfl_down(f.x, s, t_per_atom); \ f.y += shfl_down(f.y, s, t_per_atom); \ f.z += shfl_down(f.z, s, t_per_atom); \ tor.x += shfl_down(tor.x, s, t_per_atom); \ tor.y += shfl_down(tor.y, s, t_per_atom); \ tor.z += shfl_down(tor.z, s, t_per_atom); \ if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ for (int r=0; r<6; r++) \ virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ if (offset==0 && ii1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ f.x += shfl_down(f.x, s, t_per_atom); \ f.y += shfl_down(f.y, s, t_per_atom); \ f.z += shfl_down(f.z, s, t_per_atom); \ if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ for (int r=0; r<6; r++) \ virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ if (offset==0 && ii ucl_abs(aug[0])) { numtyp swapt; swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; } if (ucl_abs(aug[8]) > ucl_abs(aug[0])) { numtyp swapt; swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; } if (aug[0] != (numtyp)0.0) { if (0!=0) { numtyp swapt; swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt; swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt; swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt; swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt; } } else if (aug[4] != (numtyp)0.0) { if (1!=0) { numtyp swapt; swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; } } else if (aug[8] != (numtyp)0.0) { if (2!=0) { numtyp swapt; swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; } } else *error_flag=2; t = aug[4]/aug[0]; aug[5]-=t*aug[1]; aug[6]-=t*aug[2]; aug[7]-=t*aug[3]; t = aug[8]/aug[0]; aug[9]-=t*aug[1]; aug[10]-=t*aug[2]; aug[11]-=t*aug[3]; if (ucl_abs(aug[9]) > ucl_abs(aug[5])) { numtyp swapt; swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; } if (aug[5] != (numtyp)0.0) { if (1!=1) { numtyp swapt; swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt; swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt; swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt; swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt; } } else if (aug[9] != (numtyp)0.0) { if (2!=1) { numtyp swapt; swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; } } t = aug[9]/aug[5]; aug[10]-=t*aug[6]; aug[11]-=t*aug[7]; if (aug[10] == (numtyp)0.0) *error_flag=2; ans[2] = aug[11]/aug[10]; t = (numtyp)0.0; t += aug[6]*ans[2]; ans[1] = (aug[7]-t) / aug[5]; t = (numtyp)0.0; t += aug[1]*ans[1]; t += aug[2]*ans[2]; ans[0] = (aug[3]-t) / aug[0]; }; /* ---------------------------------------------------------------------- compute rotation matrix from quaternion conjugate quat = [w i j k] ------------------------------------------------------------------------- */ ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, numtyp mat[9]) { numtyp4 q; fetch4(q,qi,quat_tex); numtyp w2 = q.x*q.x; numtyp i2 = q.y*q.y; numtyp j2 = q.z*q.z; numtyp k2 = q.w*q.w; numtyp twoij = (numtyp)2.0*q.y*q.z; numtyp twoik = (numtyp)2.0*q.y*q.w; numtyp twojk = (numtyp)2.0*q.z*q.w; numtyp twoiw = (numtyp)2.0*q.y*q.x; numtyp twojw = (numtyp)2.0*q.z*q.x; numtyp twokw = (numtyp)2.0*q.w*q.x; mat[0] = w2+i2-j2-k2; mat[3] = twoij-twokw; mat[6] = twojw+twoik; mat[1] = twoij+twokw; mat[4] = w2-i2+j2-k2; mat[7] = twojk-twoiw; mat[2] = twoik-twojw; mat[5] = twojk+twoiw; mat[8] = w2-i2-j2+k2; }; /* ---------------------------------------------------------------------- transposed matrix times diagonal matrix ------------------------------------------------------------------------- */ ucl_inline void gpu_transpose_times_diag3(const numtyp m[9], const numtyp4 d, numtyp ans[9]) { ans[0] = m[0]*d.x; ans[1] = m[3]*d.y; ans[2] = m[6]*d.z; ans[3] = m[1]*d.x; ans[4] = m[4]*d.y; ans[5] = m[7]*d.z; ans[6] = m[2]*d.x; ans[7] = m[5]*d.y; ans[8] = m[8]*d.z; }; /* ---------------------------------------------------------------------- multiply mat1 times mat2 ------------------------------------------------------------------------- */ ucl_inline void gpu_times3(const numtyp m[9], const numtyp m2[9], numtyp ans[9]) { ans[0] = m[0]*m2[0] + m[1]*m2[3] + m[2]*m2[6]; ans[1] = m[0]*m2[1] + m[1]*m2[4] + m[2]*m2[7]; ans[2] = m[0]*m2[2] + m[1]*m2[5] + m[2]*m2[8]; ans[3] = m[3]*m2[0] + m[4]*m2[3] + m[5]*m2[6]; ans[4] = m[3]*m2[1] + m[4]*m2[4] + m[5]*m2[7]; ans[5] = m[3]*m2[2] + m[4]*m2[5] + m[5]*m2[8]; ans[6] = m[6]*m2[0] + m[7]*m2[3] + m[8]*m2[6]; ans[7] = m[6]*m2[1] + m[7]*m2[4] + m[8]*m2[7]; ans[8] = m[6]*m2[2] + m[7]*m2[5] + m[8]*m2[8]; }; /* ---------------------------------------------------------------------- Apply principal rotation generator about x to rotation matrix m ------------------------------------------------------------------------- */ ucl_inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9]) { ans[0] = 0; ans[1] = -m[2]; ans[2] = m[1]; ans[3] = 0; ans[4] = -m[5]; ans[5] = m[4]; ans[6] = 0; ans[7] = -m[8]; ans[8] = m[7]; }; /* ---------------------------------------------------------------------- Apply principal rotation generator about y to rotation matrix m ------------------------------------------------------------------------- */ ucl_inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9]) { ans[0] = m[2]; ans[1] = 0; ans[2] = -m[0]; ans[3] = m[5]; ans[4] = 0; ans[5] = -m[3]; ans[6] = m[8]; ans[7] = 0; ans[8] = -m[6]; }; /* ---------------------------------------------------------------------- Apply principal rotation generator about z to rotation matrix m ------------------------------------------------------------------------- */ ucl_inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9]) { ans[0] = -m[1]; ans[1] = m[0]; ans[2] = 0; ans[3] = -m[4]; ans[4] = m[3]; ans[5] = 0; ans[6] = -m[7]; ans[7] = m[6]; ans[8] = 0; }; /* ---------------------------------------------------------------------- matrix times vector ------------------------------------------------------------------------- */ ucl_inline void gpu_times_column3(const numtyp m[9], const numtyp v[3], numtyp ans[3]) { ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2]; ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2]; ans[2] = m[6]*v[0] + m[7]*v[1] + m[8]*v[2]; }; #endif