/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS

// Split n threads into a 2-dimensional grid of blocks plus threads per block;
// the returned values are grid.x (x), grid.y (y) and threads.x (z).

#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))

inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
  int3 gridparams;
  int sharedsize = 16000;

  if(shared_per_thread > 0)
    threadsmax = sharedsize / shared_per_thread < threadsmax ?
                 sharedsize / shared_per_thread : threadsmax;

  if((n < 60 * 32) || (threadsmax < 64)) gridparams.z = 32;
  else if((n < 60 * 64) || (threadsmax < 128)) gridparams.z = 64;
  else if((n < 60 * 128) || (threadsmax < 256)) gridparams.z = 128;
  else if((n < 60 * 256) || (threadsmax < 512)) gridparams.z = 256;
  else gridparams.z = 512;

  // if requested, use the largest power-of-2 block size not exceeding threadsmax
  if(p2) {
    gridparams.z = 16;

    while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
  }

  int blocks = (n + gridparams.z - 1) / gridparams.z;

  if(blocks > 10000)
    gridparams.x = gridparams.y = int(sqrt(blocks));
  else {
    gridparams.x = blocks;
    gridparams.y = 1;
  }

  while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;

  if(gridparams.x == 0) gridparams.x = 1;

  return gridparams;
}

// return value: 1 if f < 0, else 0
// take care when passing expressions such as "blockIdx.x - n" for f:
// they might be interpreted as an unsigned int
static inline __device__ int negativCUDA(float f)
{
  return ((unsigned int) 1 << 31 & (__float_as_int(f))) >> 31;
}

// return value: -1 if f < 0, else +1
static inline __device__ float fsignCUDA(float f)
{
  return f < 0.0f ? -1.0f : 1.0f;
}
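// Usage sketch (illustrative only, therefore not compiled): how getgrid()'s result
// is typically mapped onto a CUDA launch configuration on the host side.
// "my_kernel" and "launch_my_kernel" are placeholder names, not part of the package.
#if 0
__global__ void my_kernel(int n);          // assumed to be defined elsewhere

inline void launch_my_kernel(int n)
{
  int3 layout = getgrid(n, 0, 256, true);  // request a power-of-2 block size
  dim3 grid(layout.x, layout.y, 1);        // 2D grid of blocks
  dim3 threads(layout.z, 1, 1);            // 1D block of threads
  my_kernel <<< grid, threads >>> (n);
}
#endif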
// Functions to copy data between global and shared memory (in fact between any
// two memory regions on the device, as long as the calling threads have the
// corresponding read and write access). blockDim.y and blockDim.z are assumed to be 1.
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
  int i;

  // the cast avoids unsigned wrap-around of the loop bound when n < blockDim.x
  for(i = 0; i < n - (int) blockDim.x; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

// Copy data between two memory areas on the device; 3D block dimensions are allowed.
static __device__ inline void copyData(double* source, double* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  // the cast avoids unsigned wrap-around of the loop bound for small n
  for(i = 0; i < n - (int)(blockDim.x * blockDim.y * blockDim.z); i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(float* source, float* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - (int)(blockDim.x * blockDim.y * blockDim.z); i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(int* source, int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - (int)(blockDim.x * blockDim.y * blockDim.z); i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - (int)(blockDim.x * blockDim.y * blockDim.z); i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}
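// Usage sketch (illustrative only, not compiled): stage a block-sized tile of a
// global array in shared memory, work on it, and write it back. The kernel name,
// TILE size and launch assumption (TILE threads per block) are placeholders.
#if 0
#define TILE 256

__global__ void scale_tile(float* data, float factor)
{
  __shared__ float buf[TILE];

  float* tile = data + blockIdx.x * TILE;  // slice handled by this block
  copyGlobToShared(tile, buf, TILE);       // global -> shared (ends with __syncthreads)

  buf[threadIdx.x] *= factor;              // per-thread work on the staged tile
  __syncthreads();

  copySharedToGlob(buf, tile, TILE);       // shared -> global (ends with __syncthreads)
}
#endif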
// Functions to sum over the values of one block. The P2 variants require blockDim.x
// to be a power of 2, otherwise the behaviour is not well defined.
// On return, data[0] = sum_{i=0}^{blockDim.x-1} data[i].
// For reduceBlockP2 and reduceBlock, blockDim.y = 1 and blockDim.z = 1 are assumed.
static __device__ inline void reduceBlockP2(int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(unsigned int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(float* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(double* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(float* data)
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(int* data)
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(unsigned int* data)
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(double* data)
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
  int i;

  // the cast avoids unsigned wrap-around of the loop bound when n < blockDim.x
  for(i = 0; i < n - (int) blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
  int i;

  for(i = 0; i < n - (int) blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

static __device__ inline void reduce(float* data, int n) //cautious not sure if working
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  // note: j is not reset between the two phases; see the caution above
  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}
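// Usage sketch (illustrative only, not compiled): per-block sum of a global array
// with reduceBlock(). Kernel name and SUM_BLOCK are placeholders; launch with
// SUM_BLOCK threads per block.
#if 0
#define SUM_BLOCK 128

__global__ void block_sums(const float* in, float* out, int n)
{
  __shared__ float buf[SUM_BLOCK];

  int idx = blockIdx.x * SUM_BLOCK + threadIdx.x;
  buf[threadIdx.x] = (idx < n) ? in[idx] : 0.0f;  // pad the last block with zeros

  reduceBlock(buf);                               // buf[0] now holds the block sum

  if(threadIdx.x == 0) out[blockIdx.x] = buf[0];
}
#endif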
static __device__ inline void reduce(double* data, int n) //cautious not sure if working
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void minOfBlock(float* data)
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void maxOfBlock(float* data)
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void minOfBlock(double* data)
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void maxOfBlock(double* data)
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void minOfData(double* data, int n) //cautious not sure if working
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void maxOfData(double* data, int n) //cautious not sure if working
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}
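// Usage sketch (illustrative only, not compiled): per-block maximum with maxOfBlock().
// Kernel name and MAX_BLOCK are placeholders; launch with MAX_BLOCK threads per block
// and assume all input values are larger than the padding value.
#if 0
#define MAX_BLOCK 128

__global__ void block_max(const float* in, float* out, int n)
{
  __shared__ float buf[MAX_BLOCK];

  int idx = blockIdx.x * MAX_BLOCK + threadIdx.x;
  buf[threadIdx.x] = (idx < n) ? in[idx] : -1.0e30f;  // pad out-of-range threads

  maxOfBlock(buf);                                    // buf[0] now holds the block maximum

  if(threadIdx.x == 0) out[blockIdx.x] = buf[0];
}
#endif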
static __device__ inline void minOfData(float* data, int n) //cautious not sure if working
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void maxOfData(float* data, int n) //cautious not sure if working
{
  __syncthreads();

  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

#if X_PRECISION == 2
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  X_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _x_type_tex.normalized = false;                      // access with non-normalized texture coordinates
  _x_type_tex.filterMode = cudaFilterModePoint;        // point mode, so no interpolation
  _x_type_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates

  const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);

#if X_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<X_FLOAT4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
#else
  return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
  return _x_type[i];
#endif
}

#if V_PRECISION == 2
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  V_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _v_radius_tex.normalized = false;                    // access with non-normalized texture coordinates
  _v_radius_tex.filterMode = cudaFilterModePoint;      // point mode, so no interpolation
  _v_radius_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates

  const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<V_FLOAT4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
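// Sketch (illustrative only, not compiled) of the split/reassemble trick the
// double-precision texture fetches above rely on: each double is stored as two
// 32-bit words and rebuilt with __hiloint2double. The kernel name is a placeholder.
#if 0
__global__ void roundtrip_double(const int2* packed, double* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < n) {
    int2 v = packed[i];                   // low word in x, high word in y
    out[i] = __hiloint2double(v.y, v.x);  // same reassembly as tex1Dfetch_double
  }
}
#endif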
static __device__ inline V_FLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_v_radius_tex, i);
#else
  return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
  return _v_radius[i];
#endif
}

inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _omega_rmass_tex.normalized = false;                    // access with non-normalized texture coordinates
  _omega_rmass_tex.filterMode = cudaFilterModePoint;      // point mode, so no interpolation
  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates

  const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<V_FLOAT4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_omega_rmass_tex, i);
#else
  return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
  return _omega_rmass[i];
#endif
}

#if F_PRECISION == 2
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  F_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _q_tex.normalized = false;                    // access with non-normalized texture coordinates
  _q_tex.filterMode = cudaFilterModePoint;      // point mode, so no interpolation
  _q_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates

  const textureReference* q_texture_ptr = &MY_AP(q_tex);

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<F_FLOAT>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
#else
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}

static __device__ inline F_FLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
  return tex1Dfetch(_q_tex, i);
#else
  return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
  return _q[i];
#endif
}

#endif // CRM_CUDA_UTILS
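// Usage sketch (illustrative only, not compiled) of the bind/fetch pair for per-atom
// charges: BindQTexture() is host-side setup, fetchQ() is its device-side counterpart.
// The kernel name is a placeholder; F_FLOAT and the _q fallback array are provided
// elsewhere in the package.
#if 0
// host side, before launching kernels that read charges:
//   BindQTexture(sdata);   // binds sdata->atom.q when CUDA_USE_TEXTURE is defined

__global__ void squared_charge_kernel(F_FLOAT* out, int nlocal)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < nlocal) {
    F_FLOAT qi = fetchQ(i);  // texture path if CUDA_USE_TEXTURE is defined, else plain _q[i]
    out[i] = qi * qi;
  }
}
#endif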
/*
inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata, coeff_tex)
{
  #ifdef CUDA_USE_TEXTURE
  _coeff_tex.normalized = false;                    // access with non-normalized texture coordinates
  _coeff_tex.filterMode = cudaFilterModePoint;      // point mode, so no interpolation
  _coeff_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates

  const textureReference* coeff_texture_ptr;
  cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));

  #if F_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<X_FLOAT4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
  #else
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
  #endif
  #endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
  #ifdef CUDA_USE_TEXTURE
  #if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
  #else
  return tex1Dfetch_double(_x_type_tex, i);
  #endif
  #else
  return _x_type[i];
  #endif
}
*/

#define SBBITS 30

static inline __device__ int sbmask(int j)
{
  return j >> SBBITS & 3;
}

static inline __device__ void minimum_image(X_FLOAT4 &delta)
{
  if(_triclinic == 0) {
    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }
  } else {
    // z wrap: the check is on the z periodicity, and the tilt corrections to y and x
    // are applied before delta.z itself is wrapped, so all three use the same decision
    if(_periodicity[2]) {
      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }

    // y wrap: likewise, the xy tilt correction to x is applied before delta.y is wrapped
    if(_periodicity[1]) {
      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }
  }
}

static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
{
  ci.x = x2.x - x1.x;
  ci.y = x2.y - x1.y;
  ci.z = x2.z - x1.z;
  minimum_image(ci);
  ci.x += x1.x;
  ci.y += x1.y;
  ci.z += x1.z;
}
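// Usage sketch (illustrative only, not compiled): how sbmask() and minimum_image()
// are typically combined in a pair-interaction loop. The function, its arguments and
// the neighbor-list layout are placeholders; _x_type and the box data come from the package.
#if 0
static __device__ void pair_neighbor_sketch(int i, const int* neighbors, int numneigh)
{
  X_FLOAT4 xi = fetchXType(i);

  for(int jj = 0; jj < numneigh; jj++) {
    int jraw  = neighbors[jj];
    int which = sbmask(jraw);                // 0..3: special-bond category of this pair
    int j     = jraw & ((1 << SBBITS) - 1);  // strip the flag bits to recover the index

    X_FLOAT4 xj = fetchXType(j);
    X_FLOAT4 delta;
    delta.x = xi.x - xj.x;
    delta.y = xi.y - xj.y;
    delta.z = xi.z - xj.z;

    minimum_image(delta);                    // fold the distance vector into the periodic box

    // ... evaluate the pair interaction using delta and `which` ...
  }
}
#endif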