/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS

//split n threads into a 2 dimensional grid of blocks; the returned int3 carries
//grid.x and grid.y in its x/y components and the block size (threads.x) in its z component
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))

inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
  int3 gridparams;
  int sharedsize = 16000;

  // limit the block size so that the requested shared memory per thread still fits
  if(shared_per_thread > 0)
    threadsmax = MIN(sharedsize / shared_per_thread, threadsmax);

  // block-size choice; the original selection heuristic is missing from this copy of the
  // file, so as a stand-in use the largest power of two <= threadsmax (capped at 512),
  // which also satisfies the p2 (power-of-two) request
  gridparams.z = 32;

  while(gridparams.z * 2 <= MIN(threadsmax, 512)) gridparams.z *= 2;

  // distribute the blocks over grid.x/grid.y; large counts are split into a roughly
  // square 2D grid to stay below the per-dimension launch limit
  int blocks = (n + gridparams.z - 1) / gridparams.z;

  if(blocks > 10000)
    gridparams.x = gridparams.y = int(sqrt((double) blocks));
  else {
    gridparams.x = blocks;
    gridparams.y = 1;
  }

  // make sure the launch covers all n threads
  while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;

  return gridparams;
}

//return value: 1 if f<0; else 0 (reads the IEEE sign bit)
static inline __device__ int negativCUDA(float f)
{
  return ((unsigned int) __float_as_int(f)) >> 31;
}

//return value: -1 if f<0; else +1
static inline __device__ float fsignCUDA(float f)
{
  return f < 0.0f ? -1.0f : 1.0f;
}

//functions to copy data between global and shared memory (in fact they can copy between any
//two device memory regions, provided the corresponding read and write access is allowed)
//blockDim.y and blockDim.z are assumed to be 1
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  // strided copy: each thread moves one element per iteration
  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  // handle the remainder
  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

// texture access helpers: when CUDA_USE_TEXTURE is defined, the per-atom arrays are bound to
// 1D textures and read through tex1Dfetch; in double precision the data is stored as int2/int4
// words and reassembled with __hiloint2double

#if X_PRECISION == 2
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  X_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _x_type_tex.normalized = false;                   // do not use normalized texture coordinates
  _x_type_tex.filterMode = cudaFilterModePoint;     // point mode, so no filtering/interpolation
  _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates

  const textureReference* x_type_texture_ptr;
  cudaGetTextureReference(&x_type_texture_ptr, MY_CONST(x_type_tex));

#if X_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
#else
  return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
  return _x_type[i];
#endif
}
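// Illustrative usage sketch (not from the original USER-CUDA sources): shows how the int3
// returned by getgrid() is typically turned into launch dimensions, and that a texture bind
// precedes the kernel call. 'ExampleLaunchConfig' and 'SomeKernel' are hypothetical names.
inline void ExampleLaunchConfig(cuda_shared_data* sdata, int nall, dim3 &grid, dim3 &threads)
{
  int3 layout = getgrid(nall, 0, 256);   // x,y: grid dimensions, z: threads per block

  threads.x = layout.z;
  threads.y = 1;
  threads.z = 1;
  grid.x = layout.x;
  grid.y = layout.y;
  grid.z = 1;

  BindXTypeTexture(sdata);               // no-op unless CUDA_USE_TEXTURE is defined

  // the caller would then launch: SomeKernel<<<grid, threads>>>(...);
}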
#if V_PRECISION == 2
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  V_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _v_radius_tex.normalized = false;                   // do not use normalized texture coordinates
  _v_radius_tex.filterMode = cudaFilterModePoint;     // point mode, so no filtering/interpolation
  _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates

  const textureReference* v_radius_texture_ptr;
  cudaGetTextureReference(&v_radius_texture_ptr, MY_CONST(v_radius_tex));

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_v_radius_tex, i);
#else
  return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
  return _v_radius[i];
#endif
}

inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _omega_rmass_tex.normalized = false;                   // do not use normalized texture coordinates
  _omega_rmass_tex.filterMode = cudaFilterModePoint;     // point mode, so no filtering/interpolation
  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates

  const textureReference* omega_rmass_texture_ptr;
  cudaGetTextureReference(&omega_rmass_texture_ptr, MY_CONST(omega_rmass_tex));

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_omega_rmass_tex, i);
#else
  return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
  return _omega_rmass[i];
#endif
}

#if F_PRECISION == 2
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  F_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif
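// Added note: tex1Dfetch cannot return 64-bit words, so double-precision arrays are bound as
// int2/int4 textures and rebuilt with __hiloint2double, as the tex1Dfetch_double_* helpers
// above do. The hypothetical device function below illustrates the reassembly on a plain int2
// value (no texture involved): the low 32 bits live in .x, the high 32 bits in .y.
static __device__ inline double example_rebuild_double(int2 halves)
{
  return __hiloint2double(halves.y, halves.x);   // (hi, lo) -> IEEE-754 double
}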
inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _q_tex.normalized = false;                   // do not use normalized texture coordinates
  _q_tex.filterMode = cudaFilterModePoint;     // point mode, so no filtering/interpolation
  _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates

  const textureReference* q_texture_ptr;
  cudaGetTextureReference(&q_texture_ptr, MY_CONST(q_tex));

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
#else
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}

static __device__ inline F_FLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
  return tex1Dfetch(_q_tex, i);
#else
  return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
  return _q[i];
#endif
}

#endif  // note: the matching #if/#ifdef for this #endif is not present in this excerpt

/*
inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata, coeff_tex)
{
#ifdef CUDA_USE_TEXTURE
  _coeff_tex.normalized = false;                   // do not use normalized texture coordinates
  _coeff_tex.filterMode = cudaFilterModePoint;     // point mode, so no filtering/interpolation
  _coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates

  const textureReference* coeff_texture_ptr;
  cudaGetTextureReference(&coeff_texture_ptr, MY_CONST(coeff_tex));

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
#else
  return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
  return _x_type[i];
#endif
}
*/

#define SBBITS 30
static inline __device__ int sbmask(int j)
{
  return j >> SBBITS & 3;
}

static inline __device__ void minimum_image(X_FLOAT4 &delta)
{
  if(_triclinic == 0) {
    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }
  } else {
    // triclinic box: wrap z first, carrying the tilt contributions _h[3] and _h[4] into y and x
    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] : (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] : (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] : (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
    }

    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }
  }
}

static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
{
  ci.x = x2.x - x1.x;
  ci.y = x2.y - x1.y;
  ci.z = x2.z - x1.z;
  minimum_image(ci);
  ci.x += x1.x;
  ci.y += x1.y;
  ci.z += x1.z;
}
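// Illustrative usage sketch (not from the original file): a typical pair-style kernel forms
// the distance vector between two atoms and applies the minimum image convention before
// computing the squared distance. 'example_distsq' is a hypothetical helper; fetchXType,
// minimum_image, X_FLOAT/X_FLOAT4 and X_F() are the definitions used above.
static __device__ inline X_FLOAT example_distsq(int i, int j)
{
  X_FLOAT4 xi = fetchXType(i);
  X_FLOAT4 xj = fetchXType(j);

  X_FLOAT4 d;
  d.x = xi.x - xj.x;
  d.y = xi.y - xj.y;
  d.z = xi.z - xj.z;
  d.w = X_F(0.0);

  minimum_image(d);   // wrap the components into the periodic box

  return d.x * d.x + d.y * d.y + d.z * d.z;
}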