diff --git a/lib/cuda/Makefile.common b/lib/cuda/Makefile.common index 1952fad37b..d98f75f3b5 100644 --- a/lib/cuda/Makefile.common +++ b/lib/cuda/Makefile.common @@ -14,7 +14,8 @@ SHELL = /bin/sh # System-specific settings -CUDA_INSTALL_PATH = /usr/local/cuda +#CUDA_INSTALL_PATH = /usr/local/cuda +CUDA_INSTALL_PATH = /home/crtrott/lib/cuda # e.g. in Gentoo # CUDA_INSTALL_PATH = /opt/cuda @@ -97,8 +98,22 @@ else NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false SMVERSIONFLAGS := -arch sm_21 else - CUDA_FLAGS += -DCUDA_ARCH=99 - SMVERSIONFLAGS := -arch sm_13 + ifeq ($(strip $(arch)), 30) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_30 + else + ifeq ($(strip $(arch)), 35) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_35 + else + CUDA_FLAGS += -DCUDA_ARCH=99 + SMVERSIONFLAGS := -arch sm_13 + endif + endif endif endif endif diff --git a/lib/cuda/crm_cuda_utils.cu b/lib/cuda/crm_cuda_utils.cu index be66bb417a..6337d0d015 100644 --- a/lib/cuda/crm_cuda_utils.cu +++ b/lib/cuda/crm_cuda_utils.cu @@ -1,22 +1,22 @@ /* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + Steve Plimpton, sjplimp@sandia.gov - See the README file in the top-level LAMMPS directory. + See the README file in the top-level LAMMPS directory. - ----------------------------------------------------------------------- + ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ + https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany + Theoretical Physics II, University of Technology Ilmenau, Germany - See the README file in the USER-CUDA directory. + See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ @@ -28,36 +28,43 @@ #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) -inline int3 getgrid(int n,int shared_per_thread=0,int threadsmax=256, bool p2=false) +inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false) { int3 gridparams; - int sharedsize=16000; - if(shared_per_thread>0) threadsmax= sharedsize/shared_per_thread 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? 
sharedsize / shared_per_thread : threadsmax; + + if((n < 60 * 32) || (threadsmax < 64)) + gridparams.z = 32; + else if((n < 60 * 64) || (threadsmax < 128)) + gridparams.z = 64; + else if((n < 60 * 128) || (threadsmax < 256)) + gridparams.z = 128; + else if((n < 60 * 256) || (threadsmax < 512)) + gridparams.z = 256; + else gridparams.z = 512; + + if(p2) { + gridparams.z = 16; + + while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2; + } + + + int blocks = (n + gridparams.z - 1) / gridparams.z; + + if(blocks > 10000) + gridparams.x = gridparams.y = int(sqrt(blocks)); + else { + gridparams.x = blocks; + gridparams.y = 1; + } + + while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++; + + if(gridparams.x == 0) gridparams.x = 1; - int blocks=(n+gridparams.z-1)/gridparams.z; - if(blocks>10000) - gridparams.x=gridparams.y=int(sqrt(blocks)); - else - {gridparams.x=blocks; gridparams.y=1;} - while(gridparams.x*gridparams.y*gridparams.z>31; + return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31; } -//return value: -1 if f<0; else +1 +//return value: -1 if f<0; else +1 static inline __device__ float fsignCUDA(float f) { - return f<0.0f?-1.0f:1.0f; + return f < 0.0f ? -1.0f : 1.0f; } //functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regims on device - as long as you have read respectively write rights) //blockDim.y and blockDim.z are assumed to be 1 -static inline __device__ void copySharedToGlob(int* shared, int* glob,const int& n) +static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n) { - int i,k; - k=n-blockDim.x; - for(i=0;i t, int i) { - int2 v = tex1Dfetch(t,i); - return __hiloint2double(v.y, v.x); + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); } static __device__ inline X_FLOAT4 tex1Dfetch_double(texture t, int i) { - int4 v = tex1Dfetch(t,2*i); - int4 u = tex1Dfetch(t,2*i+1); - X_FLOAT4 w; - - w.x= __hiloint2double(v.y, v.x); - w.y= __hiloint2double(v.w, v.z); - w.z= __hiloint2double(u.y, u.x); - w.w= __hiloint2double(u.w, u.z); - return w; + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + X_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; } #endif inline void BindXTypeTexture(cuda_shared_data* sdata) { - #ifdef CUDA_USE_TEXTURE - _x_type_tex.normalized = false; // access with normalized texture coordinates - _x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* x_type_texture_ptr; - cudaGetTextureReference(&x_type_texture_ptr, MY_CONST(x_type_tex)); +#ifdef CUDA_USE_TEXTURE + _x_type_tex.normalized = false; // access with normalized texture coordinates + _x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex); - #if X_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); - #endif - #endif +#if X_PRECISION 
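Reviewer note on getgrid() above, since the heuristic is easy to misread: the block size goes into gridparams.z, sized so roughly 60 blocks' worth of work exists at each step, and once the block count exceeds 10000 it is split over a 2D grid (the top-up loop compensates for sqrt() rounding down). A minimal host-only sketch of the same logic; Geom and make_launch_geometry are illustrative names, not LAMMPS API, and the 16000-byte shared-memory budget is taken from the code above.

    // Host-only sketch of the getgrid() launch-geometry heuristic (illustrative names).
    #include <cmath>
    #include <cstdio>

    struct Geom { int gx, gy, threads; };          // gx*gy blocks, 'threads' per block

    static Geom make_launch_geometry(int n, int shared_per_thread = 0,
                                     int threadsmax = 256, bool p2 = false)
    {
      const int sharedsize = 16000;                // per-block shared memory budget, bytes
      if (shared_per_thread > 0) {
        int cap = sharedsize / shared_per_thread;  // threads that fit in the budget
        if (cap < threadsmax) threadsmax = cap;
      }

      int threads;                                 // step block size up with problem size
      if      (n < 60 *  32 || threadsmax <  64) threads =  32;
      else if (n < 60 *  64 || threadsmax < 128) threads =  64;
      else if (n < 60 * 128 || threadsmax < 256) threads = 128;
      else if (n < 60 * 256 || threadsmax < 512) threads = 256;
      else                                       threads = 512;

      if (p2) {                                    // largest power of two <= threadsmax
        threads = 16;
        while (threads * 2 <= threadsmax) threads *= 2;
      }

      int blocks = (n + threads - 1) / threads;
      Geom g;
      g.threads = threads;
      if (blocks > 10000) {                        // split over a 2D grid (per-dim limits)
        g.gx = g.gy = (int)sqrt((double)blocks);
      } else {
        g.gx = blocks;
        g.gy = 1;
      }
      while (g.gx * g.gy * threads < n) g.gx++;    // sqrt() rounds down; top up
      if (g.gx == 0) g.gx = 1;
      return g;
    }

    int main()
    {
      Geom g = make_launch_geometry(100000);
      printf("grid %d x %d, %d threads/block\n", g.gx, g.gy, g.threads);  // 196 x 1, 512
      return 0;
    }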
== 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif } static __device__ inline X_FLOAT4 fetchXType(int i) { - #ifdef CUDA_USE_TEXTURE - #if X_PRECISION == 1 - return tex1Dfetch(_x_type_tex,i); - #else - return tex1Dfetch_double(_x_type_tex,i); - #endif - #else - return _x_type[i]; - #endif +#ifdef CUDA_USE_TEXTURE +#if X_PRECISION == 1 + return tex1Dfetch(_x_type_tex, i); +#else + return tex1Dfetch_double(_x_type_tex, i); +#endif +#else + return _x_type[i]; +#endif } #if V_PRECISION == 2 static __device__ inline double tex1Dfetch_double_v(texture t, int i) { - int2 v = tex1Dfetch(t,i); - return __hiloint2double(v.y, v.x); + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); } static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture t, int i) { - int4 v = tex1Dfetch(t,2*i); - int4 u = tex1Dfetch(t,2*i+1); - V_FLOAT4 w; - - w.x= __hiloint2double(v.y, v.x); - w.y= __hiloint2double(v.w, v.z); - w.z= __hiloint2double(u.y, u.x); - w.w= __hiloint2double(u.w, u.z); - return w; + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + V_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; } #endif inline void BindVRadiusTexture(cuda_shared_data* sdata) { - #ifdef CUDA_USE_TEXTURE - _v_radius_tex.normalized = false; // access with normalized texture coordinates - _v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* v_radius_texture_ptr; - cudaGetTextureReference(&v_radius_texture_ptr, MY_CONST(v_radius_tex)); +#ifdef CUDA_USE_TEXTURE + _v_radius_tex.normalized = false; // access with normalized texture coordinates + _v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex); - #if V_PRECISION == 1 - cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); - cudaBindTexture(0,v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax*sizeof(X_FLOAT4)); - #else - cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); - cudaBindTexture(0,v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax*2*sizeof(int4)); - #endif - #endif +#if V_PRECISION == 1 + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); + cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_FLOAT4)); +#else + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); + cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif } static __device__ inline V_FLOAT4 fetchVRadius(int i) { - #ifdef CUDA_USE_TEXTURE - #if V_PRECISION == 1 - return tex1Dfetch(_v_radius_tex,i); - #else - return tex1Dfetch_double_v(_v_radius_tex,i); - #endif - #else - return _v_radius[i]; - #endif +#ifdef 
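For readers unfamiliar with the tex1Dfetch_double* helpers above: texture fetches cannot return double, so the arrays are bound as int2 (int4 for 4-vectors) and each double is reassembled from its high and low 32-bit words with __hiloint2double. Below is a self-contained sketch of just the reassembly step; the kernel name is hypothetical, the real code fetches the int2 through a texture rather than a plain pointer, and the host-side memcpy split assumes a little-endian host, as this package does.

    // Sketch: reassembling a double from two 32-bit words, as tex1Dfetch_double does.
    #include <cstdio>
    #include <cstring>

    __global__ void reassemble(const int2* in, double* out, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        out[i] = __hiloint2double(in[i].y, in[i].x);   // y = high word, x = low word
    }

    int main()
    {
      double h = 3.141592653589793;
      int2 words;
      memcpy(&words, &h, sizeof(double));   // little-endian: x = low word, y = high word

      int2* d_in;
      double* d_out;
      cudaMalloc(&d_in, sizeof(int2));
      cudaMalloc(&d_out, sizeof(double));
      cudaMemcpy(d_in, &words, sizeof(int2), cudaMemcpyHostToDevice);

      reassemble<<<1, 1>>>(d_in, d_out, 1);

      double back = 0.0;
      cudaMemcpy(&back, d_out, sizeof(double), cudaMemcpyDeviceToHost);
      printf("%.15f\n", back);              // 3.141592653589793
      cudaFree(d_in);
      cudaFree(d_out);
      return 0;
    }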
CUDA_USE_TEXTURE +#if V_PRECISION == 1 + return tex1Dfetch(_v_radius_tex, i); +#else + return tex1Dfetch_double_v(_v_radius_tex, i); +#endif +#else + return _v_radius[i]; +#endif } inline void BindOmegaRmassTexture(cuda_shared_data* sdata) { - #ifdef CUDA_USE_TEXTURE - _omega_rmass_tex.normalized = false; // access with normalized texture coordinates - _omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* omega_rmass_texture_ptr; - cudaGetTextureReference(&omega_rmass_texture_ptr, MY_CONST(omega_rmass_tex)); +#ifdef CUDA_USE_TEXTURE + _omega_rmass_tex.normalized = false; // access with normalized texture coordinates + _omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex); - #if V_PRECISION == 1 - cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); - cudaBindTexture(0,omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax*sizeof(X_FLOAT4)); - #else - cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); - cudaBindTexture(0,omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax*2*sizeof(int4)); - #endif - #endif +#if V_PRECISION == 1 + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); + cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_FLOAT4)); +#else + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); + cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif } static __device__ inline V_FLOAT4 fetchOmegaRmass(int i) { - #ifdef CUDA_USE_TEXTURE - #if V_PRECISION == 1 - return tex1Dfetch(_omega_rmass_tex,i); - #else - return tex1Dfetch_double_v(_omega_rmass_tex,i); - #endif - #else - return _omega_rmass[i]; - #endif +#ifdef CUDA_USE_TEXTURE +#if V_PRECISION == 1 + return tex1Dfetch(_omega_rmass_tex, i); +#else + return tex1Dfetch_double_v(_omega_rmass_tex, i); +#endif +#else + return _omega_rmass[i]; +#endif } #if F_PRECISION == 2 static __device__ inline double tex1Dfetch_double_f(texture t, int i) { - int2 v = tex1Dfetch(t,i); - return __hiloint2double(v.y, v.x); + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); } static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture t, int i) { - int4 v = tex1Dfetch(t,2*i); - int4 u = tex1Dfetch(t,2*i+1); - F_FLOAT4 w; - - w.x= __hiloint2double(v.y, v.x); - w.y= __hiloint2double(v.w, v.z); - w.z= __hiloint2double(u.y, u.x); - w.w= __hiloint2double(u.w, u.z); - return w; + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + F_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; } #endif inline void BindQTexture(cuda_shared_data* sdata) { - #ifdef CUDA_USE_TEXTURE - _q_tex.normalized = false; // access with normalized texture coordinates - _q_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* q_texture_ptr; - cudaGetTextureReference(&q_texture_ptr, 
MY_CONST(q_tex)); +#ifdef CUDA_USE_TEXTURE + _q_tex.normalized = false; // access with normalized texture coordinates + _q_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* q_texture_ptr = &MY_AP(q_tex); - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); - cudaBindTexture(0,q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); - cudaBindTexture(0,q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax*sizeof(int2)); - #endif - #endif +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); + cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); + cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2)); +#endif +#endif } static __device__ inline F_FLOAT fetchQ(int i) { - #ifdef CUDA_USE_TEXTURE - #if F_PRECISION == 1 - return tex1Dfetch(_q_tex,i); - #else - return tex1Dfetch_double_f(_q_tex,i); - #endif - #else - return _q[i]; - #endif +#ifdef CUDA_USE_TEXTURE +#if F_PRECISION == 1 + return tex1Dfetch(_q_tex, i); +#else + return tex1Dfetch_double_f(_q_tex, i); +#endif +#else + return _q[i]; +#endif } #endif @@ -776,7 +831,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex) _coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no _coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates const textureReference* coeff_texture_ptr; - cudaGetTextureReference(&coeff_texture_ptr, MY_CONST(coeff_tex)); + cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex)); #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); @@ -798,62 +853,67 @@ static __device__ inline X_FLOAT4 fetchXType(int i) #endif #else return _x_type[i]; - #endif + #endif } */ #define SBBITS 30 -static inline __device__ int sbmask(int j) { +static inline __device__ int sbmask(int j) +{ return j >> SBBITS & 3; } -static inline __device__ void minimum_image(X_FLOAT4& delta) +static inline __device__ void minimum_image(X_FLOAT4 &delta) { - if (_triclinic == 0) { - if (_periodicity[0]) { - delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : - (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); + if(_triclinic == 0) { + if(_periodicity[0]) { + delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : + (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0)); } - if (_periodicity[1]) { - delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] : - (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); + + if(_periodicity[1]) { + delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : + (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0)); } - if (_periodicity[2]) { - delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : - (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); + + if(_periodicity[2]) { + delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : + (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0)); } } else { - if (_periodicity[1]) { - delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] : - (delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0)); - delta.y += delta.z < -X_F(0.5)*_prd[2] ? _h[3] : - (delta.z > X_F(0.5)*_prd[2] ?-_h[3] : X_F(0.0)); - delta.x += delta.z < -X_F(0.5)*_prd[2] ? 
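Context for sbmask() above: LAMMPS packs a neighbor's special-bond class into the top two bits of its index (SBBITS = 30), so j >> SBBITS & 3 recovers a code 0..3 and masking those bits off recovers the plain index. A host-only sketch of the round trip; sbpack and sbstrip are illustrative helpers and not LAMMPS API, only sbmask matches the patch, and the decode relies on the arithmetic right shift that LAMMPS already assumes on its target platforms.

    // Host-only sketch of the special-bond bit packing behind sbmask().
    #include <cstdio>

    #define SBBITS 30

    static int sbmask(int j) { return j >> SBBITS & 3; }   // as in the patch

    // Illustrative helpers (not LAMMPS API): pack/strip the 2-bit class.
    static int sbpack(int j, int sb) { return (int)((unsigned)j | ((unsigned)sb << SBBITS)); }
    static int sbstrip(int j)        { return (int)((unsigned)j & ~(3u << SBBITS)); }

    int main()
    {
      int j = sbpack(12345, 2);    // neighbor 12345 tagged with special-bond class 2
      printf("class=%d index=%d\n", sbmask(j), sbstrip(j));   // class=2 index=12345
      return 0;
    }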
_h[4] : - (delta.z > X_F(0.5)*_prd[2] ?-_h[4] : X_F(0.0)); + if(_periodicity[1]) { + delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : + (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0)); + delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] : + (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0)); + delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] : + (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0)); } - if (_periodicity[1]) { - delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] : - (delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0)); - delta.x += delta.y < -X_F(0.5)*_prd[1] ? _h[5] : - (delta.y > X_F(0.5)*_prd[1] ?-_h[5] : X_F(0.0)); + + if(_periodicity[1]) { + delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : + (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0)); + delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] : + (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0)); } - if (_periodicity[0]) { - delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] : - (delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0)); + + if(_periodicity[0]) { + delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : + (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0)); } } } -static inline __device__ void closest_image(X_FLOAT4& x1,X_FLOAT4& x2,X_FLOAT4& ci) +static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci) { - ci.x=x2.x-x1.x; - ci.y=x2.y-x1.y; - ci.z=x2.z-x1.z; + ci.x = x2.x - x1.x; + ci.y = x2.y - x1.y; + ci.z = x2.z - x1.z; minimum_image(ci); - ci.x+=x1.x; - ci.y+=x1.y; - ci.z+=x1.z; + ci.x += x1.x; + ci.y += x1.y; + ci.z += x1.z; } diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu index 0685ce4490..9f9900a2d8 100644 --- a/lib/cuda/cuda_pair.cu +++ b/lib/cuda/cuda_pair.cu @@ -1,28 +1,28 @@ /* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + Steve Plimpton, sjplimp@sandia.gov - See the README file in the top-level LAMMPS directory. + See the README file in the top-level LAMMPS directory. - ----------------------------------------------------------------------- + ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ + https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany + Theoretical Physics II, University of Technology Ilmenau, Germany - See the README file in the USER-CUDA directory. + See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. 
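Two remarks on minimum_image() above. First, the wrap is branch-free per component: add plus or minus prd whenever the displacement component leaves [-prd/2, prd/2]. Second, in the triclinic branch the z-wrap is guarded by _periodicity[1] in both the old and the new code; that looks like it should be _periodicity[2], and deserves a second pair of eyes since this hunk is otherwise formatting-only. A minimal host-only sketch of the orthogonal case, with a plain double[3] standing in for X_FLOAT4:

    // Host-only sketch of the orthogonal minimum-image convention used above.
    #include <cstdio>

    static void minimum_image(double d[3], const double prd[3], const int periodic[3])
    {
      for (int k = 0; k < 3; ++k)
        if (periodic[k])
          d[k] += d[k] < -0.5 * prd[k] ?  prd[k]
                : (d[k] >  0.5 * prd[k] ? -prd[k] : 0.0);
    }

    int main()
    {
      double prd[3] = {10.0, 10.0, 10.0};
      int    per[3] = {1, 1, 1};
      double d[3]   = {9.0, -7.5, 0.25};          // raw displacement x2 - x1
      minimum_image(d, prd, per);
      printf("%g %g %g\n", d[0], d[1], d[2]);     // -1 2.5 0.25
      return 0;
    }

closest_image() is then just x1 plus this wrapped displacement.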
------------------------------------------------------------------------- */ -enum PAIR_FORCES {PAIR_NONE,PAIR_BORN,PAIR_BUCK,PAIR_CG_CMM,PAIR_LJ_CHARMM,PAIR_LJ_CLASS2,PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6}; -enum COUL_FORCES {COUL_NONE,COUL_CHARMM,COUL_CHARMM_IMPLICIT,COUL_CUT,COUL_LONG, COUL_DEBYE, COUL_GROMACS,COUL_SPECIAL}; +enum PAIR_FORCES {PAIR_NONE, PAIR_BORN, PAIR_BUCK, PAIR_CG_CMM, PAIR_LJ_CHARMM, PAIR_LJ_CLASS2, PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6}; +enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_LONG, COUL_DEBYE, COUL_GROMACS, COUL_SPECIAL}; #define DATA_NONE 0 #define DATA_V 1 #define DATA_TAG 2 @@ -80,75 +80,75 @@ __device__ __constant__ F_FLOAT* MY_AP(coeff8_gm); __device__ __constant__ F_FLOAT* MY_AP(coeff9_gm); __device__ __constant__ F_FLOAT* MY_AP(coeff10_gm); - #define _coeff1_gm_tex MY_AP(coeff1_gm_tex) - #if F_PRECISION == 1 - texture _coeff1_gm_tex; - #else - texture _coeff1_gm_tex; - #endif +#define _coeff1_gm_tex MY_AP(coeff1_gm_tex) +#if F_PRECISION == 1 +texture _coeff1_gm_tex; +#else +texture _coeff1_gm_tex; +#endif - #define _coeff2_gm_tex MY_AP(coeff2_gm_tex) - #if F_PRECISION == 1 - texture _coeff2_gm_tex; - #else - texture _coeff2_gm_tex; - #endif +#define _coeff2_gm_tex MY_AP(coeff2_gm_tex) +#if F_PRECISION == 1 +texture _coeff2_gm_tex; +#else +texture _coeff2_gm_tex; +#endif - #define _coeff3_gm_tex MY_AP(coeff3_gm_tex) - #if F_PRECISION == 1 - texture _coeff3_gm_tex; - #else - texture _coeff3_gm_tex; - #endif +#define _coeff3_gm_tex MY_AP(coeff3_gm_tex) +#if F_PRECISION == 1 +texture _coeff3_gm_tex; +#else +texture _coeff3_gm_tex; +#endif - #define _coeff4_gm_tex MY_AP(coeff4_gm_tex) - #if F_PRECISION == 1 - texture _coeff4_gm_tex; - #else - texture _coeff4_gm_tex; - #endif +#define _coeff4_gm_tex MY_AP(coeff4_gm_tex) +#if F_PRECISION == 1 +texture _coeff4_gm_tex; +#else +texture _coeff4_gm_tex; +#endif - #define _coeff5_gm_tex MY_AP(coeff5_gm_tex) - #if F_PRECISION == 1 - texture _coeff5_gm_tex; - #else - texture _coeff5_gm_tex; - #endif +#define _coeff5_gm_tex MY_AP(coeff5_gm_tex) +#if F_PRECISION == 1 +texture _coeff5_gm_tex; +#else +texture _coeff5_gm_tex; +#endif - #define _coeff6_gm_tex MY_AP(coeff6_gm_tex) - #if F_PRECISION == 1 - texture _coeff6_gm_tex; - #else - texture _coeff6_gm_tex; - #endif +#define _coeff6_gm_tex MY_AP(coeff6_gm_tex) +#if F_PRECISION == 1 +texture _coeff6_gm_tex; +#else +texture _coeff6_gm_tex; +#endif - #define _coeff7_gm_tex MY_AP(coeff7_gm_tex) - #if F_PRECISION == 1 - texture _coeff7_gm_tex; - #else - texture _coeff7_gm_tex; - #endif +#define _coeff7_gm_tex MY_AP(coeff7_gm_tex) +#if F_PRECISION == 1 +texture _coeff7_gm_tex; +#else +texture _coeff7_gm_tex; +#endif - #define _coeff8_gm_tex MY_AP(coeff8_gm_tex) - #if F_PRECISION == 1 - texture _coeff8_gm_tex; - #else - texture _coeff8_gm_tex; - #endif +#define _coeff8_gm_tex MY_AP(coeff8_gm_tex) +#if F_PRECISION == 1 +texture _coeff8_gm_tex; +#else +texture _coeff8_gm_tex; +#endif - #define _coeff9_gm_tex MY_AP(coeff9_gm_tex) - #if F_PRECISION == 1 - texture _coeff9_gm_tex; - #else - texture _coeff9_gm_tex; - #endif +#define _coeff9_gm_tex MY_AP(coeff9_gm_tex) +#if F_PRECISION == 1 +texture _coeff9_gm_tex; +#else +texture _coeff9_gm_tex; +#endif - #define _coeff10_gm_tex MY_AP(coeff10_gm_tex) - #if F_PRECISION == 1 - texture _coeff10_gm_tex; - #else - texture _coeff10_gm_tex; - #endif +#define _coeff10_gm_tex 
MY_AP(coeff10_gm_tex) +#if F_PRECISION == 1 +texture _coeff10_gm_tex; +#else +texture _coeff10_gm_tex; +#endif //if more than 5 coefficients are needed for a pair potential add them here @@ -172,17 +172,17 @@ __device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2]; __device__ __constant__ X_FLOAT _cut_innersq_global; -template -__global__ void Pair_Kernel_TpA(int eflag, int vflag,int eflag_atom,int vflag_atom); +template +__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom); -template -__global__ void Pair_Kernel_BpA(int eflag, int vflag,int eflag_atom,int vflag_atom); +template +__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom); -template -__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase); +template +__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase); -template -__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase); +template +__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase); #include #include "cuda_pair_cu.h" @@ -191,654 +191,635 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vfla //Functions which are shared by pair styles //Update Buffersize -void Cuda_UpdateBuffer(cuda_shared_data* sdata,int size) +void Cuda_UpdateBuffer(cuda_shared_data* sdata, int size) { - CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed"); - if(sdata->buffersizebuffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - } - cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); - CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed"); + CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed"); + + if(sdata->buffersize < size) { + MYDBG(printf("Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); + CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed"); } void Cuda_Pair_UpdateNeighbor_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { //Neighbor - cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) ); + cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned)); + 
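Cuda_UpdateBuffer() above is a grow-only scratch allocator: reallocate only when the request exceeds capacity, bump buffer_new so dependent state knows the pointer moved, and re-publish the pointer through the constant-memory symbol on every call. A stripped-down sketch of the same pattern with raw CUDA calls; Scratch, update_buffer, and d_buffer are illustrative names, and the real code routes allocation through the CudaWrapper_* layer.

    // Grow-only device scratch buffer, mirroring Cuda_UpdateBuffer() (sketch).
    #include <cstdio>

    __device__ __constant__ void* d_buffer;    // kernels read the current pointer here

    struct Scratch { void* ptr; int capacity; int generation; };

    static void update_buffer(Scratch* s, int size)
    {
      if (s->capacity < size) {                // never shrink: realloc only on growth
        cudaFree(s->ptr);                      // cudaFree(NULL) is a no-op
        cudaMalloc(&s->ptr, size);
        s->capacity = size;
        s->generation++;                       // dependent state must be rebuilt
      }
      cudaMemcpyToSymbol(d_buffer, &s->ptr, sizeof(void*));   // re-publish every call
    }

    int main()
    {
      Scratch s = {0, 0, 0};
      update_buffer(&s, 1 << 20);              // allocates 1 MiB
      update_buffer(&s, 1 << 16);              // smaller request: no reallocation
      printf("capacity=%d generation=%d\n", s.capacity, s.generation);
      cudaFree(s.ptr);
      return 0;
    }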
cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm, sizeof(int)); -if(sdata->overlap_comm) -{ - cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); -} + if(sdata->overlap_comm) { + cudaMemcpyToSymbol(MY_AP(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*)); + } } -//Update constants after nmax change which are generally needed by all pair styles +//Update constants after nmax change which are generally needed by all pair styles void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { - CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin"); + CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin"); - //System - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - - //Atom - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) ); + //System + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + + //Atom + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & 
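This block is the core of the MY_CONST to MY_AP change: every device array is reached through a pointer stored in __constant__ memory, and the host re-publishes those pointers with cudaMemcpyToSymbol whenever an array is reallocated. Since CUDA 5 the symbol is passed as the variable itself rather than a string, which is what the MY_AP form resolves to. A minimal sketch of the pattern; _ilist here is a stand-in for the real symbols.

    // Sketch: publishing a device pointer through a __constant__ symbol (MY_AP pattern).
    #include <cstdio>

    __device__ __constant__ int* _ilist;       // kernels read the list via this symbol

    __global__ void first_of(int* out) { *out = _ilist[0]; }

    int main()
    {
      int h[4] = {7, 8, 9, 10};
      int *d_list, *d_out;
      cudaMalloc(&d_list, sizeof(h));
      cudaMalloc(&d_out, sizeof(int));
      cudaMemcpy(d_list, h, sizeof(h), cudaMemcpyHostToDevice);

      // Equivalent of: cudaMemcpyToSymbol(MY_AP(ilist), &sneighlist->ilist.dev_data, sizeof(int*));
      cudaMemcpyToSymbol(_ilist, &d_list, sizeof(int*));

      first_of<<<1, 1>>>(d_out);
      int v;
      cudaMemcpy(&v, d_out, sizeof(int), cudaMemcpyDeviceToHost);
      printf("%d\n", v);                       // 7
      cudaFree(d_list);
      cudaFree(d_out);
      return 0;
    }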
sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); - //Other - cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) ); - CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End"); + //Other + cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*)); + CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End"); } //Initialisation of GPU Constants which rarely change -void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=false, bool use_global_params=false, bool need_innercut=false, bool need_cut=true ) +void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = false, bool use_global_params = false, bool need_innercut = false, bool need_cut = true) { - unsigned cuda_ntypes = sdata->atom.ntypes + 1; - unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; - unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; - unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2; + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; + unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; + unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2; - //check if enough constant memory is available - if((cuda_ntypes2 > CUDA_MAX_TYPES2 )&& !use_global_params) - printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u " - "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " - "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE-1); - if((cuda_ntypes2 > CUDA_MAX_TYPES2 )&& !use_global_params) - exit(0); - //type conversion of cutoffs and parameters - if(need_cut) - { - X_FLOAT cutsq[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_global * sdata->pair.cut_global); - } - } - - int cutsqdiffer=0; - X_FLOAT cutsq_global; - cutsq_global = (X_FLOAT) (sdata->pair.cut_global * sdata->pair.cut_global); - if(sdata->pair.cut) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=i; j<=sdata->atom.ntypes; ++j) - { - if(sdata->pair.cut[i][j]>1e-6) - { - cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); - cutsq[j * cuda_ntypes + i] = (X_FLOAT) (sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); - } + //check if enough constant memory is available + if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) + printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u " + "(assumed at compile time). 
re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1); - if(i==1&&j==1) cutsq_global = cutsq[i * cuda_ntypes + j]; - if((cutsq_global - cutsq[i * cuda_ntypes + j])*(cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } + if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) + exit(0); - if(sdata->pair.cutsq) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=i; j<=sdata->atom.ntypes; ++j) - { - if(sdata->pair.cut[i][j]>1e-6) - { - cutsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cutsq[i][j]); - cutsq[j * cuda_ntypes + i] = (X_FLOAT) (sdata->pair.cutsq[i][j]); - } + //type conversion of cutoffs and parameters + if(need_cut) { + X_FLOAT cutsq[cuda_ntypes2]; - if(i==1&&j==1) cutsq_global = cutsq[i * cuda_ntypes + j]; - if((cutsq_global - cutsq[i * cuda_ntypes + j])*(cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - //printf("CUTSQGLOB: %i %e\n",cutsqdiffer,cutsq_global); - if(cutsqdiffer) - { - - cutsq_global = -1.0; - cudaMemcpyToSymbol(MY_CONST(cutsq) , cutsq , nx ); - } - cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) ); - } - - if(need_innercut) - { - X_FLOAT cut_innersq[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); - } - } - - int cutsqdiffer=0; - X_FLOAT cut_innersq_global; - cut_innersq_global = (X_FLOAT) (sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); - if(sdata->pair.cut_inner) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=i; j<=sdata->atom.ntypes; ++j) - { - if(sdata->pair.cut_inner[i][j]>1e-6) - { - cut_innersq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); - cut_innersq[j * cuda_ntypes + i] = (X_FLOAT) (sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); - } - if(i==1&&j==1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j]; - if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j])*(cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - if(cutsqdiffer) - { - cut_innersq_global = -1.0; - cudaMemcpyToSymbol(MY_CONST(cut_innersq) , cut_innersq , nx ); - } - cudaMemcpyToSymbol(MY_CONST(cut_innersq_global) ,&cut_innersq_global , sizeof(X_FLOAT) ); + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); + } } - - if(need_q) - { - X_FLOAT cut_coulsq[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - } - } - - int cutsqdiffer=0; - X_FLOAT cut_coulsq_global; - cut_coulsq_global = (X_FLOAT) (sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - if(sdata->pair.cut_coulsq_global> cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; - if(sdata->pair.cut_coul) - { - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=i; j<=sdata->atom.ntypes; ++j) - { - if(sdata->pair.cut_coul[i][j]>1e-6) - { - cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT) (sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); - cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT) (sdata->pair.cut_coul[i][j] * 
sdata->pair.cut_coul[i][j]); - } - if(i==1&&j==1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; - if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j])*(cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6) - cutsqdiffer++; - } - } - } - if(cutsqdiffer) - { - cut_coulsq_global = -1.0; - cudaMemcpyToSymbol(MY_CONST(cut_coulsq) , cut_coulsq , nx ); - } - cudaMemcpyToSymbol(MY_CONST(cut_coulsq_global),&cut_coulsq_global , sizeof(X_FLOAT) ); + + int cutsqdiffer = 0; + X_FLOAT cutsq_global; + cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); + + if(sdata->pair.cut) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut[i][j] > 1e-6) { + cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); + cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); + } + + if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j]; + + if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } } - CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); - if(ncoeff>0) - { - F_FLOAT coeff1[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; - } - } + if(sdata->pair.cutsq) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut[i][j] > 1e-6) { + cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]); + cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]); + } - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy((sdata->pair.coeff1_gm.dev_data),coeff1, n,cudaMemcpyHostToDevice); - - _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff1_gm_texture_ptr; - cudaGetTextureReference(&coeff1_gm_texture_ptr, MY_CONST(coeff1_gm_tex)); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed"); + if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j]; - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); - cudaBindTexture(0,coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed"); - cudaBindTexture(0,coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed"); - #endif - - } - else - cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n); - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); + if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } - if(ncoeff>1) - { - F_FLOAT coeff2[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; - } - } + //printf("CUTSQGLOB: %i 
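The cutoff setup spread over this hunk follows one pattern three times (cutsq, cut_innersq, cut_coulsq): fill a flattened (ntypes+1)-squared matrix at index i*cuda_ntypes+j, track whether any pair deviates from the [1][1] value, and if so set the corresponding *_global scalar to -1.0 as a sentinel telling kernels to read the per-pair array instead. A host-only sketch of one round, with illustrative names:

    // Host-only sketch of the per-type cutoff flattening in Cuda_Pair_Init_AllStyles.
    #include <cstdio>
    #include <vector>

    int main()
    {
      const int ntypes = 2, nt = ntypes + 1;   // types are 1-based; row/column 0 unused
      double cut[3][3] = {{0, 0, 0}, {0, 2.5, 3.0}, {0, 3.0, 2.5}};

      std::vector<double> cutsq(nt * nt, 0.0);
      double cutsq_global = cut[1][1] * cut[1][1];
      int differ = 0;

      for (int i = 1; i <= ntypes; ++i)
        for (int j = i; j <= ntypes; ++j) {
          double c2 = cut[i][j] * cut[i][j];
          cutsq[i * nt + j] = cutsq[j * nt + i] = c2;            // symmetric fill
          if ((cutsq_global - c2) * (cutsq_global - c2) > 1e-6)  // same test as above
            differ++;
        }

      if (differ) cutsq_global = -1.0;   // sentinel: kernels must use the per-pair array
      printf("cutsq[1][2]=%g cutsq_global=%g\n", cutsq[1 * nt + 2], cutsq_global);
      return 0;
    }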
%e\n",cutsqdiffer,cutsq_global); + if(cutsqdiffer) { - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n,cudaMemcpyHostToDevice); + cutsq_global = -1.0; + cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx); + } - _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff2_gm_texture_ptr; - cudaGetTextureReference(&coeff2_gm_texture_ptr, MY_CONST(coeff2_gm_tex)); + cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT)); + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif + if(need_innercut) { + X_FLOAT cut_innersq[cuda_ntypes2]; - } - else - cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n); - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); - - if(ncoeff>2) - { - F_FLOAT coeff3[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; - } - } + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); + } + } - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n,cudaMemcpyHostToDevice); - _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff3_gm_texture_ptr; - cudaGetTextureReference(&coeff3_gm_texture_ptr, MY_CONST(coeff3_gm_tex)); + int cutsqdiffer = 0; + X_FLOAT cut_innersq_global; + cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - else - cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n); - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); - - if(ncoeff>3) - { - F_FLOAT coeff4[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n,cudaMemcpyHostToDevice); - _coeff4_gm_tex.normalized = 
false; // access with normalized texture coordinates - _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff4_gm_texture_ptr; - cudaGetTextureReference(&coeff4_gm_texture_ptr, MY_CONST(coeff4_gm_tex)); + if(sdata->pair.cut_inner) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut_inner[i][j] > 1e-6) { + cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); + cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - else - cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n); - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed"); + if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j]; - if(ncoeff>4) - { - F_FLOAT coeff5[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n,cudaMemcpyHostToDevice); - _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff5_gm_texture_ptr; - cudaGetTextureReference(&coeff5_gm_texture_ptr, MY_CONST(coeff5_gm_tex)); + if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) * (cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - else - cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n); - } + if(cutsqdiffer) { + cut_innersq_global = -1.0; + cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx); + } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed"); - if(ncoeff>5) - { - F_FLOAT coeff6[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n,cudaMemcpyHostToDevice); - _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - 
_coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff6_gm_texture_ptr; - cudaGetTextureReference(&coeff6_gm_texture_ptr, MY_CONST(coeff6_gm_tex)); + cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT)); + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed"); + if(need_q) { + X_FLOAT cut_coulsq[cuda_ntypes2]; - if(ncoeff>6) - { - F_FLOAT coeff7[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n,cudaMemcpyHostToDevice); - _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff7_gm_texture_ptr; - cudaGetTextureReference(&coeff7_gm_texture_ptr, MY_CONST(coeff7_gm_tex)); + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + } + } - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed"); + int cutsqdiffer = 0; + X_FLOAT cut_coulsq_global; + cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - if(ncoeff>7) - { - F_FLOAT coeff8[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n,cudaMemcpyHostToDevice); - _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff8_gm_texture_ptr; - cudaGetTextureReference(&coeff8_gm_texture_ptr, MY_CONST(coeff8_gm_tex)); + if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff8_gm_texture_ptr, 
sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed"); + if(sdata->pair.cut_coul) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut_coul[i][j] > 1e-6) { + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + } - if(ncoeff>8) - { - F_FLOAT coeff9[cuda_ntypes2]; - for(int i=1; i<=sdata->atom.ntypes; ++i) - { - for(int j=1; j<=sdata->atom.ntypes; ++j) - { - coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j]; - } - } - - if(use_global_params) - { - cudaMemcpyToSymbol(MY_CONST(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*) ); - cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n,cudaMemcpyHostToDevice); - _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates - _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no - _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates - const textureReference* coeff9_gm_texture_ptr; - cudaGetTextureReference(&coeff9_gm_texture_ptr, MY_CONST(coeff9_gm_tex)); + if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; - #if F_PRECISION == 1 - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(F_FLOAT)); - #else - cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int2)); - #endif - } - } - CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed"); - - F_FLOAT special_lj[4]; - special_lj[0]=sdata->pair.special_lj[0]; - special_lj[1]=sdata->pair.special_lj[1]; - special_lj[2]=sdata->pair.special_lj[2]; - special_lj[3]=sdata->pair.special_lj[3]; + if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) * (cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + + if(cutsqdiffer) { + cut_coulsq_global = -1.0; + cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx); + } + + cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT)); + } + + CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); + + if(ncoeff > 0) { + F_FLOAT coeff1[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice); + + _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff1_gm_texture_ptr = &MY_AP(coeff1_gm_tex); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed"); + +#if F_PRECISION == 1 + 
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); + cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed"); + cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed"); +#endif + + } else + cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); + + if(ncoeff > 1) { + F_FLOAT coeff2[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice); + + _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff2_gm_texture_ptr = &MY_AP(coeff2_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + + } else + cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); + + if(ncoeff > 2) { + F_FLOAT coeff3[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice); + _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff3_gm_texture_ptr = &MY_AP(coeff3_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); + cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } else + cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); + + if(ncoeff > 3) { + F_FLOAT coeff4[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff4[i * cuda_ntypes + j] = (F_FLOAT) 
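Two things worth noting in the coefficient-texture binding this hunk reflows. The channel format switches on F_PRECISION because textures cannot carry double: single precision binds a float channel, double precision binds int2 and the fetch side reassembles with __hiloint2double as shown earlier. Separately, the bind length uses sdata->atom.nmax even though the coefficient table holds only cuda_ntypes-squared entries; that looks copied from the per-atom binders and deserves a look, though over-binding is benign in practice. A sketch of the precision switch using the legacy texture-reference API (deprecated in CUDA 11, removed in 12); F_FLOAT and coeff_tex are stand-ins:

    // Sketch of the F_PRECISION channel-format switch for a coefficient texture.
    #include <cstdio>

    #define F_PRECISION 2
    #if F_PRECISION == 1
    typedef float F_FLOAT;
    texture<float, 1> coeff_tex;     // one float per fetch
    #else
    typedef double F_FLOAT;
    texture<int2, 1> coeff_tex;      // a double travels as two 32-bit words
    #endif

    int main()
    {
      const int n = 16;
      F_FLOAT* d_coeff;
      cudaMalloc(&d_coeff, n * sizeof(F_FLOAT));

    #if F_PRECISION == 1
      cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    #else
      cudaChannelFormatDesc desc = cudaCreateChannelDesc<int2>();
    #endif
      cudaBindTexture(0, &coeff_tex, d_coeff, &desc, n * sizeof(F_FLOAT));
      printf("bind: %s\n", cudaGetErrorString(cudaGetLastError()));
      cudaFree(d_coeff);
      return 0;
    }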
+  if(ncoeff > 3) {
+    F_FLOAT coeff4[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice);
+      _coeff4_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff4_gm_texture_ptr = &MY_AP(coeff4_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed");
+
+  if(ncoeff > 4) {
+    F_FLOAT coeff5[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice);
+      _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff5_gm_texture_ptr = &MY_AP(coeff5_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed");
+
+  if(ncoeff > 5) {
+    F_FLOAT coeff6[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice);
+      _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff6_gm_texture_ptr = &MY_AP(coeff6_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed");
+
+  if(ncoeff > 6) {
+    F_FLOAT coeff7[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice);
+      _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff7_gm_texture_ptr = &MY_AP(coeff7_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed");
+
+  if(ncoeff > 7) {
+    F_FLOAT coeff8[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice);
+      _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff8_gm_texture_ptr = &MY_AP(coeff8_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed");
+
+  if(ncoeff > 8) {
+    F_FLOAT coeff9[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice);
+      _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates
+      _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
+      _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
+      const textureReference* coeff9_gm_texture_ptr = &MY_AP(coeff9_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed");
+
+  F_FLOAT special_lj[4];
+  special_lj[0] = sdata->pair.special_lj[0];
+  special_lj[1] = sdata->pair.special_lj[1];
+  special_lj[2] = sdata->pair.special_lj[2];
+  special_lj[3] = sdata->pair.special_lj[3];
-  X_FLOAT box_size[3] =
-  {
-  sdata->domain.subhi[0] - sdata->domain.sublo[0],
-  sdata->domain.subhi[1] - sdata->domain.sublo[1],
-  sdata->domain.subhi[2] - sdata->domain.sublo[2]
-  };
-
-  cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
-  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
-  cudaMemcpyToSymbol(MY_CONST(special_lj) , special_lj , sizeof(F_FLOAT)*4);
-  cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
-  cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
-
-  if(need_q)
-  {
-  F_FLOAT qqrd2e_tmp=sdata->pppm.qqrd2e;
-  F_FLOAT special_coul[4];
-  special_coul[0]=sdata->pair.special_coul[0];
-  special_coul[1]=sdata->pair.special_coul[1];
-  special_coul[2]=sdata->pair.special_coul[2];
-  special_coul[3]=sdata->pair.special_coul[3];
-
-  cudaMemcpyToSymbol(MY_CONST(special_coul) , special_coul , sizeof(F_FLOAT)*4);
-  cudaMemcpyToSymbol(MY_CONST(g_ewald) ,&sdata->pair.g_ewald , sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(qqrd2e) ,&qqrd2e_tmp , sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(kappa) ,&sdata->pair.kappa , sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(eng_coul) ,&sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*) );
-  }
-  CUT_CHECK_ERROR("Cuda_Pair: init failed");
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+
+  cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4);
+  cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
+
+  if(need_q) {
+    F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
+    F_FLOAT special_coul[4];
+    special_coul[0] = sdata->pair.special_coul[0];
+    special_coul[1] = sdata->pair.special_coul[1];
+    special_coul[2] = sdata->pair.special_coul[2];
+    special_coul[3] = sdata->pair.special_coul[3];
+
+    cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4);
+    cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*));
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init failed");
 }
 
 timespec startpairtime, endpairtime;
 
 //Function which is called prior to kernel invocation, determines grid, binds textures, updates constant memory if necessary
-void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,int eflag, int vflag, dim3& grid, dim3& threads, int& sharedperproc,bool need_q=false,int maxthreads=256)
+void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256)
 {
-  if(sdata->atom.nlocal==0) return;
-  if(sdata->atom.update_neigh)
-    Cuda_Pair_UpdateNeighbor_AllStyles(sdata,sneighlist);
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax_AllStyles(sdata,sneighlist);
-  if(sdata->atom.update_nlocal)
-  {
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-    cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  }
+  if(sdata->atom.nlocal == 0) return;
+
+  if(sdata->atom.update_neigh)
+    Cuda_Pair_UpdateNeighbor_AllStyles(sdata, sneighlist);
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax_AllStyles(sdata, sneighlist);
+
+  if(sdata->atom.update_nlocal) {
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  }
 
-  BindXTypeTexture(sdata);
-  if(need_q) BindQTexture(sdata);
-
+  BindXTypeTexture(sdata);
 
-  sharedperproc=0;
-  if(sdata->pair.use_block_per_atom) sharedperproc+=3;
-  if(eflag) sharedperproc+=1;
-  if(need_q && eflag) sharedperproc+=1;
-  if(vflag) sharedperproc+=6;
+  if(need_q) BindQTexture(sdata);
 
-  int threadnum = sneighlist->inum;
-  if (sdata->comm.comm_phase==2)threadnum=sneighlist->inum_border2;
-  if(sdata->pair.use_block_per_atom) {threadnum*=64; maxthreads=64;}
-  int3 layout=getgrid(threadnum,sharedperproc*sizeof(ENERGY_FLOAT),maxthreads,true); //need to limit to 192 threads due to register limit
-  threads.x = layout.z; threads.y = 1; threads.z = 1;
-  grid.x = layout.x; grid.y = layout.y; grid.z = 1;
-
-  int size=(unsigned)(layout.y*layout.x)*sharedperproc*sizeof(ENERGY_FLOAT);
-  if(sdata->pair.collect_forces_later) size+=(unsigned)(sdata->atom.nmax*3*sizeof(F_FLOAT));
-  Cuda_UpdateBuffer(sdata,size);
-
-  if(sdata->pair.use_block_per_atom)
-    cudaMemset(sdata->buffer, 0, size);
-  sdata->pair.lastgridsize=grid.x*grid.y;
-  sdata->pair.n_energy_virial=sharedperproc;
-  if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial-=3;
+  sharedperproc = 0;
 
-  clock_gettime(CLOCK_REALTIME,&startpairtime);
+  if(sdata->pair.use_block_per_atom) sharedperproc += 3;
 
-  MYDBG( printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
+  if(eflag) sharedperproc += 1;
+
+  if(need_q && eflag) sharedperproc += 1;
+
+  if(vflag) sharedperproc += 6;
+
+  int threadnum = sneighlist->inum;
+
+  if(sdata->comm.comm_phase == 2)threadnum = sneighlist->inum_border2;
+
+  if(sdata->pair.use_block_per_atom) {
+    threadnum *= 64;
+    maxthreads = 64;
+  }
+
+  int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
+  threads.x = layout.z;
+  threads.y = 1;
+  threads.z = 1;
+  grid.x = layout.x;
+  grid.y = layout.y;
+  grid.z = 1;
+
+  int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT);
+
+  if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT));
+
+  Cuda_UpdateBuffer(sdata, size);
+
+  if(sdata->pair.use_block_per_atom)
+    cudaMemset(sdata->buffer, 0, size);
+
+  sdata->pair.lastgridsize = grid.x * grid.y;
+  sdata->pair.n_energy_virial = sharedperproc;
+
+  if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3;
+
+  clock_gettime(CLOCK_REALTIME, &startpairtime);
+
+  MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
 }
 
 //Function which is called after the kernel invocation, collects energy and virial
-void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3& grid, int& sharedperproc,int eflag, int vflag)
+void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sharedperproc, int eflag, int vflag)
 {
-  if((not sdata->pair.collect_forces_later) && (eflag||vflag))//not sdata->comm.comm_phase==2))
-  {
-    cudaThreadSynchronize();
-    clock_gettime(CLOCK_REALTIME,&endpairtime);
-    sdata->cuda_timings.pair_kernel+=
-      endpairtime.tv_sec-startpairtime.tv_sec+1.0*(endpairtime.tv_nsec-startpairtime.tv_nsec)/1000000000;
-    CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
+  if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2))
+    cudaThreadSynchronize();
+    clock_gettime(CLOCK_REALTIME, &endpairtime);
+    sdata->cuda_timings.pair_kernel +=
+      endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
+    CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
 
-    if(eflag||vflag)
-    {
-      int n=grid.x*grid.y;
-      if(sdata->pair.use_block_per_atom)
-        grid.x=sharedperproc-3;
-      else
-        grid.x=sharedperproc;
-      grid.y=1;
-      dim3 threads(128,1,1);
-      MYDBG( printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n",eflag,vflag,grid.x,grid.y, threads.x,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x); )
-      MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(n);
-      cudaThreadSynchronize();
-      CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
-    }
+    if(eflag || vflag) {
+      int n = grid.x * grid.y;
 
-    MYDBG( printf("# CUDA: Cuda_Pair: kernel done\n"); )
-  }
+      if(sdata->pair.use_block_per_atom)
+        grid.x = sharedperproc - 3;
+      else
+        grid.x = sharedperproc;
+
+      grid.y = 1;
+      dim3 threads(128, 1, 1);
+      MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
+      MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+      cudaThreadSynchronize();
+      CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
+    }
+
+    MYDBG(printf("# CUDA: Cuda_Pair: kernel done\n");)
+  }
 }
-
+
 #include "pair_born_coul_long_cuda.cu"
 #include "pair_buck_coul_cut_cuda.cu"
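For orientation: the PairVirialCompute_reduce launch in Cuda_Pair_PostKernel_AllStyles above exists because every pair-kernel block first leaves one partial energy/virial sum per quantity in the shared buffer; one reduction block per quantity then folds the grid.x*grid.y partials. A simplified, hypothetical version of such a reduction kernel (the shipped one lives elsewhere in the package and differs in detail):

// illustrative reduction kernel, not the one shipped with the package;
// assumes blockDim.x is a power of two
__global__ void reduce_partials(double* buffer, double* out, int n)
{
  extern __shared__ double s[];
  double sum = 0.0;

  // blockIdx.x selects the quantity (energy, then the 6 virial terms)
  for(int i = threadIdx.x; i < n; i += blockDim.x)
    sum += buffer[blockIdx.x * n + i];

  s[threadIdx.x] = sum;
  __syncthreads();

  for(int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if(threadIdx.x < stride) s[threadIdx.x] += s[threadIdx.x + stride];

    __syncthreads();
  }

  if(threadIdx.x == 0) out[blockIdx.x] = s[0];   // one value per quantity
}
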
@@ -877,149 +858,158 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3& grid, int& sh
 
 void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
 {
-  CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(omega_rmass),& sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(map_array), & sdata->atom.map_array .dev_data, sizeof(int*) );
-  CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
+  CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
 }
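These cudaMemcpyToSymbol calls refresh pointers, not data: the kernels reach the atom arrays through addresses kept in constant memory, so every host-side reallocation (a grown nmax) must be followed by re-publishing the new addresses. A minimal sketch of the idiom, with hypothetical names:

// sketch (hypothetical d_x/grow_x): kernels dereference a device pointer
// stored in constant memory, so the address must be pushed again after
// every reallocation
__device__ __constant__ double* d_x;

__global__ void touch_x(int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < n) d_x[i] += 1.0;                         // reads the published address
}

void grow_x(double** dev_x, int nmax_new)
{
  cudaFree(*dev_x);
  cudaMalloc((void**)dev_x, 3 * nmax_new * sizeof(double));
  cudaMemcpyToSymbol(d_x, dev_x, sizeof(double*)); // re-publish the new address
}
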
 
 void Cuda_Pair_GenerateXType(cuda_shared_data* sdata)
 {
-  MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); )
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  if(sdata->atom.update_nlocal)
-  {
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-    cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  }
-  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout); )
+  MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);)
-  Pair_GenerateXType_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
-  MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout); )
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal) {
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  }
+
+  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateXType_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout);)
 }
 
 void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
 {
-  MYDBG(printf(" # CUDA: RevertXType ... start\n"); )
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
+  MYDBG(printf(" # CUDA: RevertXType ... start\n");)
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  Pair_RevertXType_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
-  MYDBG(printf(" # CUDA: RevertXType ... end\n"); )
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Pair_RevertXType_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  MYDBG(printf(" # CUDA: RevertXType ... end\n");)
 }
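The recurring getgrid/dim3 pattern above maps an atom count to a launch configuration: layout.z is the block size, and layout.x/layout.y span a possibly 2D grid with x*y*z >= nall. Inside a kernel the flat index is then recovered as in this hypothetical sketch:

// sketch of the index recovery matching the getgrid() launches above
__global__ void per_atom_kernel(int nall)
{
  // 2D grid, 1D blocks: fold (blockIdx.y, blockIdx.x) back into one index
  int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

  if(i >= nall) return;                            // the grid may overshoot nall

  /* per-atom work goes here */
}
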
 
 void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
 {
-  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); )
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout); )
+  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);)
-  Pair_GenerateVRadius_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
-  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout); )
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateVRadius_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout);)
 }
 
 void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata)
 {
-  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); )
-  if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
-  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout); )
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n"); fflush(stdout);)
-  Pair_GenerateOmegaRmass_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
-  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout); )
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateOmegaRmass_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout);)
 }
 
 void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
 {
   if(sdata->atom.update_nmax)
-    Cuda_Pair_UpdateNmax(sdata);
-  cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
+    Cuda_Pair_UpdateNmax(sdata);
 
-  int3 layout=getgrid(sdata->atom.nall);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  Pair_BuildXHold_Kernel<<<grid, threads,0>>>();
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Pair_BuildXHold_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
 }
 
-void Cuda_Pair_CollectForces(cuda_shared_data* sdata,int eflag, int vflag)
+void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
 {
-  cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME,&endpairtime);
-  sdata->cuda_timings.pair_kernel+=
-    endpairtime.tv_sec-startpairtime.tv_sec+1.0*(endpairtime.tv_nsec-startpairtime.tv_nsec)/1000000000;
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &endpairtime);
+  sdata->cuda_timings.pair_kernel +=
+    endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
   CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
-  dim3 threads;
-  dim3 grid;
-
-  if(eflag||vflag)
-  {
-    int n=sdata->pair.lastgridsize;
-    grid.x=sdata->pair.n_energy_virial;
-    grid.y=1;
-    threads.x=128;
-    //printf("A grid.x: %i\n",grid.x);
-    MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(n);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
-  }
-  int3 layout=getgrid(sdata->atom.nlocal);
-  threads.x = layout.z;
-  grid.x = layout.x;
-  grid.y = layout.y;
-  Pair_CollectForces_Kernel<<<grid,threads,0>>>(sdata->pair.n_energy_virial,sdata->pair.lastgridsize);
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");
-
+  dim3 threads;
+  dim3 grid;
+
+  if(eflag || vflag) {
+    int n = sdata->pair.lastgridsize;
+    grid.x = sdata->pair.n_energy_virial;
+    grid.y = 1;
+    threads.x = 128;
+    //printf("A grid.x: %i\n",grid.x);
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
+  }
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  threads.x = layout.z;
+  grid.x = layout.x;
+  grid.y = layout.y;
+  Pair_CollectForces_Kernel <<< grid, threads, 0>>>(sdata->pair.n_energy_virial, sdata->pair.lastgridsize);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");
+
 }
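The timing bookkeeping above relies on the fact that kernel launches are asynchronous: the end timestamp is only meaningful after a synchronize. The idiom, reduced to its core (the launch line is a hypothetical placeholder):

// sketch of the timing idiom used throughout these files
#include <time.h>
#include <cuda_runtime.h>

double time_gpu_section()
{
  timespec t0, t1;
  clock_gettime(CLOCK_REALTIME, &t0);
  // some_kernel<<<grid, threads>>>(...);          // asynchronous launch (hypothetical)
  cudaThreadSynchronize();                         // wait for the GPU to finish
  clock_gettime(CLOCK_REALTIME, &t1);
  return t1.tv_sec - t0.tv_sec + 1.0 * (t1.tv_nsec - t0.tv_nsec) / 1000000000;
}
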
 
diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu
index d97143a0c7..cb20343770 100644
--- a/lib/cuda/pair_eam_cuda.cu
+++ b/lib/cuda/pair_eam_cuda.cu
@@ -1,22 +1,22 @@
 /* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    Original Version: http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
+   Steve Plimpton, sjplimp@sandia.gov
 
-   See the README file in the top-level LAMMPS directory.
+   See the README file in the top-level LAMMPS directory.
 
-   -----------------------------------------------------------------------
+   -----------------------------------------------------------------------
 
    USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
+   https://sourceforge.net/projects/lammpscuda/
 
    Christian Trott, christian.trott@tu-ilmenau.de
    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
+   Theoretical Physics II, University of Technology Ilmenau, Germany
 
-   See the README file in the USER-CUDA directory.
+   See the README file in the USER-CUDA directory.
 
    This software is distributed under the GNU General Public License.
 
------------------------------------------------------------------------- */
@@ -54,17 +54,17 @@ __device__ __constant__ F_FLOAT* MY_AP(fp);
 
 #define _rhor_spline_tex MY_AP(rhor_spline_tex)
 #if F_PRECISION == 1
-texture<float,1> _rhor_spline_tex;
+texture<float, 1> _rhor_spline_tex;
 #else
-texture<int2,1> _rhor_spline_tex;
+texture<int2, 1> _rhor_spline_tex;
 #endif
 
 #define _z2r_spline_tex MY_AP(z2r_spline_tex)
 #if F_PRECISION == 1
-texture<float,1> _z2r_spline_tex;
+texture<float, 1> _z2r_spline_tex;
 #else
-texture<int2,1> _z2r_spline_tex;
+texture<int2, 1> _z2r_spline_tex;
 #endif
 
 
@@ -85,243 +85,258 @@ inline void BindEAMTextures(cuda_shared_data* sdata)
   _rhor_spline_tex.normalized = false; // access with normalized texture coordinates
   _rhor_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no
   _rhor_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
-
-  const textureReference* rhor_spline_texture_ptr;
-  cudaGetTextureReference(&rhor_spline_texture_ptr, MY_CONST(rhor_spline_tex));
-  #if F_PRECISION == 1
+  const textureReference* rhor_spline_texture_ptr = &MY_AP(rhor_spline_tex);
+
+#if F_PRECISION == 1
   cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<float>();
-  cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
-  #else
+  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
+#else
   cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<int2>();
-  cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
-  #endif
-
+  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
+#endif
+
   _z2r_spline_tex.normalized = false; // access with normalized texture coordinates
   _z2r_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no
   _z2r_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
-  const textureReference* z2r_spline_texture_ptr;
-  cudaGetTextureReference(&z2r_spline_texture_ptr, MY_CONST(z2r_spline_tex));
+  const textureReference* z2r_spline_texture_ptr = &MY_AP(z2r_spline_tex);
 
-  #if F_PRECISION == 1
+#if F_PRECISION == 1
   cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<float>();
-  cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
-  #else
+  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
+#else
   cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<int2>();
-  cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
-  #endif
-
+  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
+#endif
+
 }
 
 void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
 {
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
-  int3 layout=getgrid(sneighlist->inum,7*sizeof(F_FLOAT));
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  int size=(unsigned)(layout.y*layout.x)*7*sizeof(F_FLOAT);
-  if(sdata->buffersize<size)
-  {
-    MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
-    if(sdata->buffer!=NULL) cudaFree(sdata->buffer);
-    cudaMalloc((void**)&sdata->buffer,size);
-    sdata->buffersize=size;
-    sdata->buffer_new++;
-    MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
-  }
-  cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
+  int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+
+    if(sdata->buffer != NULL) cudaFree(sdata->buffer);
+
+    cudaMalloc((void**)&sdata->buffer, size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
 }
 
 void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
 {
-cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
-cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) );
-cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
-cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
-cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
-cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
-cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) );
-cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
+  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
 }
 
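Cuda_PairEAMCuda_UpdateBuffer above uses a grow-only allocation strategy: the shared buffer is reallocated only when a larger size is requested, and the buffer_new counter signals consumers that the address changed. The core of the pattern as a hypothetical helper:

// sketch (assumed names): grow-only device buffer with address republishing
__device__ __constant__ void* d_buffer;

void ensure_buffer(void** buffer, int* buffersize, int* buffer_new, int needed)
{
  if(*buffersize < needed) {
    if(*buffer != NULL) cudaFree(*buffer);

    cudaMalloc(buffer, needed);                    // grow, never shrink
    *buffersize = needed;
    (*buffer_new)++;                               // tells dependent code to re-bind

    cudaMemcpyToSymbol(d_buffer, buffer, sizeof(void*));
  }
}
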
 void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
 {
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
-  cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) );
-  cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
-  cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
+  cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
 }
 
-void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata,double rdr,double rdrho,int nfrho, int nrhor,int nr, int nrho,int nz2r,
-void* frho_spline,void* rhor_spline,void* z2r_spline,void* rho,void* fp,
-int* type2frho,int** type2z2r,int** type2rhor)
+void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r,
+                           void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp,
+                           int* type2frho, int** type2z2r, int** type2rhor)
 {
-  // !! LAMMPS indexes atom types starting with 1 !!
-
-  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
-  if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2)
-    printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
-           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
-           "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
-  unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
+  // !! LAMMPS indexes atom types starting with 1 !!
+
+  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+
+  if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
+    printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
+           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
+           "or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
+
+  unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
+
+  X_FLOAT cutsq_global;
+  cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
+  cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
+
+
+  F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];
+
+  for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT));
+
+  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff2) , coeff_buf , nI);
+
+  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2z2r[0][0])[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI);
+
+  delete [] coeff_buf;
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+  F_FLOAT rdr_F = rdr;
+  F_FLOAT rdrho_F = rdrho;
+  cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
+
+  rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
+  z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
+  rhor_spline_pointer = rhor_spline;
+  z2r_spline_pointer = z2r_spline;
+
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
-  X_FLOAT cutsq_global;
-  cutsq_global = (X_FLOAT) (sdata->pair.cut_global);
-  cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) );
-
-
-  F_FLOAT* coeff_buf=new F_FLOAT[cuda_ntypes*cuda_ntypes];
-  for(int i=0;i<cuda_ntypes;i++) coeff_buf[i]=type2frho[i];
-  cudaMemcpyToSymbol(MY_CONST(coeff1) , coeff_buf , cuda_ntypes*sizeof(F_FLOAT) );
-  for(int i=0;i<cuda_ntypes*cuda_ntypes;i++) coeff_buf[i]=(&type2rhor[0][0])[i];
-  cudaMemcpyToSymbol(MY_CONST(coeff2) , coeff_buf , nI );
-  for(int i=0;i<cuda_ntypes*cuda_ntypes;i++) coeff_buf[i]=(&type2z2r[0][0])[i];
-  cudaMemcpyToSymbol(MY_CONST(coeff3) , coeff_buf , nI );
-  delete [] coeff_buf;
-  X_FLOAT box_size[3] =
-  {
-  sdata->domain.subhi[0] - sdata->domain.sublo[0],
-  sdata->domain.subhi[1] - sdata->domain.sublo[1],
-  sdata->domain.subhi[2] - sdata->domain.sublo[2]
-  };
-  F_FLOAT rdr_F=rdr;
-  F_FLOAT rdrho_F=rdrho;
-  cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
-  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes , sizeof(unsigned) );
-  cudaMemcpyToSymbol(MY_CONST(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 );
-  cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(rdr), &rdr_F, sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(rdrho), &rdrho_F, sizeof(F_FLOAT) );
-  cudaMemcpyToSymbol(MY_CONST(nr), &nr, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nrho), &nrho, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nfrho), &nfrho, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) );
-  cudaMemcpyToSymbol(MY_CONST(rho), &rho, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(fp), &fp, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(frho_spline), &frho_spline, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(rhor_spline), &rhor_spline, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(z2r_spline), &z2r_spline, sizeof(F_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) );
-
-  rhor_spline_size = nrhor*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT);
-  z2r_spline_size = nz2r*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT);
-  rhor_spline_pointer = rhor_spline;
-  z2r_spline_pointer = z2r_spline;
-
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
-
 }
 
 void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
 {
-
-  if(sdata->atom.update_nmax)
-    Cuda_PairEAMCuda_UpdateNmax(sdata,sneighlist);
+
+  if(sdata->atom.update_nmax)
+    Cuda_PairEAMCuda_UpdateNmax(sdata, sneighlist);
+
   if(sdata->atom.update_neigh)
-    Cuda_PairEAMCuda_UpdateNeighbor(sdata,sneighlist);
-  if(sdata->atom.update_nlocal)
-    cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
-  if(sdata->buffer_new)
-    Cuda_PairEAMCuda_UpdateBuffer(sdata,sneighlist);
-  cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );
+    Cuda_PairEAMCuda_UpdateNeighbor(sdata, sneighlist);
 
-  int sharedperproc=0;
-  if(eflag||eflag_atom) sharedperproc=1;
-  if(vflag||vflag_atom) sharedperproc=7;
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
 
-  int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT));
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  eam_buff_offset=grid.x*grid.y;
-
-  BindXTypeTexture(sdata);
-  BindEAMTextures( sdata);// initialize only on first call
+  if(sdata->buffer_new)
+    Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
 
-
-  MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); )
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
-  PairEAMCuda_Kernel1<<<grid,threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag,eflag_atom,vflag_atom);
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
-
+  cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
+
+  int sharedperproc = 0;
+
+  if(eflag || eflag_atom) sharedperproc = 1;
+
+  if(vflag || vflag_atom) sharedperproc = 7;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  eam_buff_offset = grid.x * grid.y;
+
+  BindXTypeTexture(sdata);
+  BindEAMTextures(sdata); // initialize only on first call
 
-  MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); )
-
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
+  PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
+
+
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
+
 }
 
 void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
 {
-  int sharedperproc=0;
-  if(eflag||eflag_atom) sharedperproc=1;
-  if(vflag||vflag_atom) sharedperproc=7;
-  int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT));
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-
-  BindXTypeTexture(sdata);
-  BindEAMTextures( sdata);// initialize only on first call
-  // initialize only on first call
-  sdata->pair.lastgridsize=grid.x*grid.y;
-  sdata->pair.n_energy_virial=sharedperproc;
+  int sharedperproc = 0;
 
-  MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); )
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
-  PairEAMCuda_Kernel2<<<grid,threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag,eflag_atom,vflag_atom);
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
-  cudaThreadSynchronize();
-  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
-
-  if(eflag||vflag)
-  {
-    int n=grid.x*grid.y;
-    grid.x=sharedperproc;
-    grid.y=1;
-    threads.x=256;
-    MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
-    cudaThreadSynchronize();
-    CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
-  }
+  if(eflag || eflag_atom) sharedperproc = 1;
+
+  if(vflag || vflag_atom) sharedperproc = 7;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  BindXTypeTexture(sdata);
+  BindEAMTextures(sdata); // initialize only on first call
+  // initialize only on first call
+  sdata->pair.lastgridsize = grid.x * grid.y;
+  sdata->pair.n_energy_virial = sharedperproc;
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
+  PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
+
+  if(eflag || vflag) {
+    int n = grid.x * grid.y;
+    grid.x = sharedperproc;
+    grid.y = 1;
+    threads.x = 256;
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
+  }
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
 
-  MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); )
-
 }
 
-void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send)
+void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send)
 {
-  int3 layout=getgrid(n,0);
-  dim3 threads(layout.z, 1, 1);
-  dim3 grid(layout.x, layout.y, 1);
-  F_FLOAT* buf=(F_FLOAT*) (& ((double*)sdata->buffer)[eam_buff_offset]);
-
-  PairEAMCuda_PackComm_Kernel<<<grid,threads,0>>> ((int*) sdata->comm.sendlist.dev_data,n
-      ,sdata->comm.maxlistlength,iswap,buf);
-  cudaThreadSynchronize();
-  cudaMemcpy(buf_send, buf, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
-  cudaThreadSynchronize();
+  int3 layout = getgrid(n, 0);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
+
+  PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n
+      , sdata->comm.maxlistlength, iswap, buf);
+  cudaThreadSynchronize();
+  cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  cudaThreadSynchronize();
 }
 
-void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,void* fp)
+void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp)
 {
-  F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
-  cudaMemcpy(fp_first,buf_recv, n*sizeof(F_FLOAT), cudaMemcpyHostToDevice);
+  F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
+  cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice);
 }
 
 #undef _type2frho
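PackComm/UnpackComm above implement the device side of the halo exchange for the EAM fp values: a kernel gathers fp[sendlist[k]] into a contiguous staging area inside the shared buffer, which is then copied to the host send buffer; on the receiving side a plain contiguous copy suffices because ghost atoms are stored consecutively from index first. A compact sketch with hypothetical names:

// sketch of the gather-and-copy halo pattern above (names hypothetical)
__global__ void gather_fp(int* sendlist, int n, double* fp, double* staging)
{
  int k = blockIdx.x * blockDim.x + threadIdx.x;

  if(k < n) staging[k] = fp[sendlist[k]];          // gather into contiguous layout
}

void pack_and_download(int* d_sendlist, int n, double* d_fp,
                       double* d_staging, double* h_send)
{
  gather_fp<<<(n + 127) / 128, 128>>>(d_sendlist, n, d_fp, d_staging);
  cudaMemcpy(h_send, d_staging, n * sizeof(double), cudaMemcpyDeviceToHost);
}
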
 
diff --git a/lib/cuda/pair_sw_cuda.cu b/lib/cuda/pair_sw_cuda.cu
index f5b0807fcd..491d4d666f 100644
--- a/lib/cuda/pair_sw_cuda.cu
+++ b/lib/cuda/pair_sw_cuda.cu
@@ -1,22 +1,22 @@
 /* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    Original Version: http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
+   Steve Plimpton, sjplimp@sandia.gov
 
-   See the README file in the top-level LAMMPS directory.
+   See the README file in the top-level LAMMPS directory.
 
-   -----------------------------------------------------------------------
+   -----------------------------------------------------------------------
 
    USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
+   https://sourceforge.net/projects/lammpscuda/
 
    Christian Trott, christian.trott@tu-ilmenau.de
    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
+   Theoretical Physics II, University of Technology Ilmenau, Germany
 
-   See the README file in the USER-CUDA directory.
+   See the README file in the USER-CUDA directory.
 
    This software is distributed under the GNU General Public License.
 
------------------------------------------------------------------------- */
@@ -24,116 +24,115 @@
 
 #include
 #include "pair_sw_cuda_cu.h"
 
-__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR];
+__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR];
 
 #include "pair_sw_cuda_kernel_nc.cu"
 #include
 
-void Cuda_PairSWCuda_Init(cuda_shared_data* sdata,ParamSW_Float* params_host,void* map_host, void* elem2param_host,int nelements_h)
+void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h)
 {
   unsigned cuda_ntypes = sdata->atom.ntypes + 1;
-  X_FLOAT box_size[3] =
-  {
+  X_FLOAT box_size[3] = {
    sdata->domain.subhi[0] - sdata->domain.sublo[0],
    sdata->domain.subhi[1] - sdata->domain.sublo[1],
    sdata->domain.subhi[2] - sdata->domain.sublo[2]
   };
-  cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
-  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
-  cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
-  cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
-  cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
-  cudaMemcpyToSymbol("params_sw", params_host , sizeof(ParamSW_Float)*nelements_h*nelements_h*nelements_h );
-  cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h );
-  cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes );
-  cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
+  cudaMemcpyToSymbol(params_sw, params_host , sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h);
+  cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h);
+  cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes);
+  cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
 }
 
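The change from quoted names ("params_sw", "elem2param", "map", "nelements") to the symbols themselves is not cosmetic: the char*-based symbol lookup in cudaMemcpyToSymbol was deprecated by NVIDIA and later removed from the CUDA runtime, while the symbol form is resolved at compile time. In miniature, with a hypothetical constant:

// why the string arguments above were replaced by symbols
__device__ __constant__ int nelements_demo;        // hypothetical constant

void set_nelements(int n)
{
  // old API, deprecated and eventually removed:
  //   cudaMemcpyToSymbol("nelements_demo", &n, sizeof(int));
  cudaMemcpyToSymbol(nelements_demo, &n, sizeof(int));   // checked at compile time
}
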
-void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
+void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
 {
-  static int glob_ij_size=0;
-  static F_FLOAT4* glob_r_ij=NULL;
-  static int* glob_numneigh_red=NULL;
-  static int* glob_neighbors_red=NULL;
-  static int* glob_neightype_red=NULL;
+  static int glob_ij_size = 0;
+  static F_FLOAT4* glob_r_ij = NULL;
+  static int* glob_numneigh_red = NULL;
+  static int* glob_neighbors_red = NULL;
+  static int* glob_neightype_red = NULL;
 
-  if(glob_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT))
-  {
-    glob_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT);
+  if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
+    glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
    cudaFree(glob_r_ij);
    cudaFree(glob_numneigh_red);
    cudaFree(glob_neighbors_red);
    cudaFree(glob_neightype_red);
-    cudaMalloc(&glob_r_ij,glob_ij_size*4);
-    cudaMalloc(&glob_numneigh_red,sdata->atom.nall*sizeof(int));
-    cudaMalloc(&glob_neighbors_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
-    cudaMalloc(&glob_neightype_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
-    cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) );
-    cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) );
-    cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) );
-    cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) );
-  }
-  dim3 grid,threads;
-  int sharedperproc;
+    cudaMalloc(&glob_r_ij, glob_ij_size * 4);
+    cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
+    cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
+    cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
+    cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*));
+  }
 
-  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,64);
-  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
 
-  dim3 grid2;
-  if(sdata->atom.nall<=256*64000){
-    grid2.x = (sdata->atom.nall+255)/256;
-    grid2.y = 1;
-  } else {
-    grid2.x = (sdata->atom.nall+256*128-1)/(256*128);
+  dim3 grid2;
+
+  if(sdata->atom.nall <= 256 * 64000) {
+    grid2.x = (sdata->atom.nall + 255) / 256;
+    grid2.y = 1;
+  } else {
+    grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
    grid2.y = 128;
   }
-  grid2.z = 1;
+
+  grid2.z = 1;
   dim3 threads2;
   threads2.x = 256;
   threads2.y = 1;
   threads2.z = 1;
-  timespec time1,time2;
+  timespec time1, time2;
 
 //pre-calculate all neighbor distances and zeta_ij
-  clock_gettime(CLOCK_REALTIME,&time1);
-  Pair_SW_Kernel_TpA_RIJ<<<grid2,threads2,0,streams[1]>>>();
+  clock_gettime(CLOCK_REALTIME, &time1);
+  Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>();
   cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME,&time2);
-  sdata->cuda_timings.test1+=
-    time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-  clock_gettime(CLOCK_REALTIME,&time1);
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.test1 +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+  clock_gettime(CLOCK_REALTIME, &time1);
 
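The grid2 computation just above works around the per-dimension grid limit of 65535 blocks on these GPUs: beyond 256*64000 atoms the blocks are folded into a second grid dimension. The same intent as a small, hypothetical helper:

// sketch: split n work items into a grid that respects the 65535 limit
static dim3 make_grid(int n, int block = 256)
{
  int blocks = (n + block - 1) / block;
  dim3 g(blocks, 1, 1);

  if(blocks > 65535) {                             // fold excess blocks into y
    g.y = (blocks + 65534) / 65535;
    g.x = (blocks + g.y - 1) / g.y;
  }

  return g;
}
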
(eflag_atom,vflag_atom); - } - else - { + Pair_SW_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + } else { if(vflag) - Pair_SW_Kernel_TpA<0,1><<>> - (eflag_atom,vflag_atom); + Pair_SW_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); else - Pair_SW_Kernel_TpA<0,0><<>> - (eflag_atom,vflag_atom); + Pair_SW_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); } cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.test2+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test2 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_tersoff_cuda.cu b/lib/cuda/pair_tersoff_cuda.cu index abbf39ecf0..0ae5e846a0 100644 --- a/lib/cuda/pair_tersoff_cuda.cu +++ b/lib/cuda/pair_tersoff_cuda.cu @@ -1,22 +1,22 @@ /* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + Steve Plimpton, sjplimp@sandia.gov - See the README file in the top-level LAMMPS directory. + See the README file in the top-level LAMMPS directory. - ----------------------------------------------------------------------- + ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ + https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany + Theoretical Physics II, University of Technology Ilmenau, Germany - See the README file in the USER-CUDA directory. + See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ @@ -25,7 +25,7 @@ #include "pair_tersoff_cuda_cu.h" -__device__ __constant__ Param_Float params[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR]; +__device__ __constant__ Param_Float params[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR]; __device__ __constant__ F_FLOAT* _glob_zeta_ij; //zeta_ij __device__ __constant__ F_FLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff __device__ __constant__ bool _zbl; //is tersoff zbl? @@ -36,119 +36,118 @@ __device__ __constant__ bool _zbl; //is tersoff zbl? 
#include -void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata,Param_Float* params_host,void* map_host, void* elem2param_host,int nelements_h,bool zbl) +void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl) { unsigned cuda_ntypes = sdata->atom.ntypes + 1; - X_FLOAT box_size[3] = - { + X_FLOAT box_size[3] = { sdata->domain.subhi[0] - sdata->domain.sublo[0], sdata->domain.subhi[1] - sdata->domain.sublo[1], sdata->domain.subhi[2] - sdata->domain.sublo[2] }; - cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); - cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 ); - cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) ); - cudaMemcpyToSymbol("params", params_host , sizeof(Param_Float)*nelements_h*nelements_h*nelements_h ); - cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h ); - cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes ); - cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int)); - cudaMemcpyToSymbol("_zbl",&zbl,sizeof(bool)); + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); + cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); + cudaMemcpyToSymbol(params, params_host , sizeof(Param_Float)*nelements_h * nelements_h * nelements_h); + cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h); + cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes); + cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int)); + cudaMemcpyToSymbol(_zbl, &zbl, sizeof(bool)); } -void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) { - static F_FLOAT* glob_zeta_ij=NULL; - static int glob_zeta_ij_size=0; - static F_FLOAT4* glob_r_ij=NULL; - static int* glob_numneigh_red=NULL; - static int* glob_neighbors_red=NULL; - static int* glob_neightype_red=NULL; + static F_FLOAT* glob_zeta_ij = NULL; + static int glob_zeta_ij_size = 0; + static F_FLOAT4* glob_r_ij = NULL; + static int* glob_numneigh_red = NULL; + static int* glob_neighbors_red = NULL; + static int* glob_neightype_red = NULL; - if(glob_zeta_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT)) - { - glob_zeta_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT); - cudaFree(glob_zeta_ij); + if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { + glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); + cudaFree(glob_zeta_ij); cudaFree(glob_r_ij); 
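A note on the hunk that follows: Cuda_PairTersoffCuda (like Cuda_PairSWCuda above) caches its scratch arrays in function-local statics and frees and reallocates them only when the required size grows. A minimal sketch of that grow-only pattern; the function and parameter names are illustrative and not part of this patch:

  // Sketch only: grow-only device scratch buffer (CUDA runtime API).
  // 'buf' and 'capacity' persist across calls, e.g. as function statics.
  static void grow_device_buffer(void** buf, size_t* capacity, size_t needed)
  {
    if(needed > *capacity) {      // grow, never shrink
      cudaFree(*buf);             // cudaFree(NULL) is a harmless no-op
      cudaMalloc(buf, needed);    // error handling omitted for brevity
      *capacity = needed;
    }
  }

This trades a larger standing memory footprint for avoiding repeated cudaMalloc/cudaFree cycles when the neighbor-list size fluctuates from step to step.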
-void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) { - static F_FLOAT* glob_zeta_ij=NULL; - static int glob_zeta_ij_size=0; - static F_FLOAT4* glob_r_ij=NULL; - static int* glob_numneigh_red=NULL; - static int* glob_neighbors_red=NULL; - static int* glob_neightype_red=NULL; + static F_FLOAT* glob_zeta_ij = NULL; + static int glob_zeta_ij_size = 0; + static F_FLOAT4* glob_r_ij = NULL; + static int* glob_numneigh_red = NULL; + static int* glob_neighbors_red = NULL; + static int* glob_neightype_red = NULL; - if(glob_zeta_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT)) - { - glob_zeta_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT); - cudaFree(glob_zeta_ij); + if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { + glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); + cudaFree(glob_zeta_ij); cudaFree(glob_r_ij); cudaFree(glob_numneigh_red); cudaFree(glob_neighbors_red); cudaFree(glob_neightype_red); - cudaMalloc(&glob_zeta_ij,glob_zeta_ij_size); - cudaMalloc(&glob_r_ij,glob_zeta_ij_size*4); - cudaMalloc(&glob_numneigh_red,sdata->atom.nall*sizeof(int)); - cudaMalloc(&glob_neighbors_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int)); - cudaMalloc(&glob_neightype_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int)); - cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) ); - cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) ); - cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) ); - cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) ); - cudaMemcpyToSymbol("_glob_zeta_ij", &glob_zeta_ij , sizeof(F_FLOAT*) ); - } - dim3 grid,threads; - int sharedperproc; + cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size); + cudaMalloc(&glob_r_ij, glob_zeta_ij_size * 4); + cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int)); + cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); + cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); + cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*)); + cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij , sizeof(F_FLOAT*)); + } - Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,64); - cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); - dim3 grid2; - if(sdata->atom.nall<=256*64000){ - grid2.x = (sdata->atom.nall+255)/256; - grid2.y = 1; - } else { - grid2.x = (sdata->atom.nall+256*128-1)/(256*128); + dim3 grid2; + + if(sdata->atom.nall <= 256 * 64000) { + grid2.x = (sdata->atom.nall + 255) / 256; + grid2.y = 1; + } else { + grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128); grid2.y = 128; } - grid2.z = 1; + + grid2.z = 1; dim3 threads2; threads2.x = 256; threads2.y = 1; threads2.z = 1; - timespec time1,time2; + timespec time1, time2; //pre-calculate all neighbor distances and zeta_ij - clock_gettime(CLOCK_REALTIME,&time1); - Pair_Tersoff_Kernel_TpA_RIJ<<<grid2,threads2,0,streams[1]>>> - (); + clock_gettime(CLOCK_REALTIME, &time1); + Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>> + (); cudaThreadSynchronize(); - Pair_Tersoff_Kernel_TpA_ZetaIJ<<<grid2,threads2,0,streams[1]>>> - (); + Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>> + (); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.test1+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - clock_gettime(CLOCK_REALTIME,&time1); + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test1 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + clock_gettime(CLOCK_REALTIME, &time1); //actual force calculation - unsigned int sharedsize=(sharedperproc*sizeof(ENERGY_FLOAT)+4*sizeof(F_FLOAT))*threads.x; //extra 4 floats per thread used to reduce register pressure - if(eflag) - { + unsigned int sharedsize = (sharedperproc * 
sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure + + if(eflag) { if(vflag) - Pair_Tersoff_Kernel_TpA<1,1><<<grid,threads,sharedsize,streams[1]>>> - (eflag_atom,vflag_atom); + Pair_Tersoff_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); else - Pair_Tersoff_Kernel_TpA<1,0><<<grid,threads,sharedsize,streams[1]>>> - (eflag_atom,vflag_atom); - } - else - { + Pair_Tersoff_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + } else { if(vflag) - Pair_Tersoff_Kernel_TpA<0,1><<<grid,threads,sharedsize,streams[1]>>> - (eflag_atom,vflag_atom); + Pair_Tersoff_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); else - Pair_Tersoff_Kernel_TpA<0,0><<<grid,threads,sharedsize,streams[1]>>> - (eflag_atom,vflag_atom); + Pair_Tersoff_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); } cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME,&time2); - sdata->cuda_timings.test2+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test2 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pppm_cuda.cu b/lib/cuda/pppm_cuda.cu index cabea885d3..dd434b9bbf 100644 --- a/lib/cuda/pppm_cuda.cu +++ b/lib/cuda/pppm_cuda.cu @@ -1,22 +1,22 @@ /* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + Steve Plimpton, sjplimp@sandia.gov - See the README file in the top-level LAMMPS directory. + See the README file in the top-level LAMMPS directory. - ----------------------------------------------------------------------- + ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ + https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany + Theoretical Physics II, University of Technology Ilmenau, Germany - See the README file in the USER-CUDA directory. + See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ @@ -34,484 +34,493 @@ #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? 
(a) : (b)) - __device__ __constant__ FFT_FLOAT* work1; - __device__ __constant__ FFT_FLOAT* work2; - __device__ __constant__ FFT_FLOAT* work3; - __device__ __constant__ PPPM_FLOAT* greensfn; - __device__ __constant__ PPPM_FLOAT* gf_b; - __device__ __constant__ PPPM_FLOAT* fkx; - __device__ __constant__ PPPM_FLOAT* fky; - __device__ __constant__ PPPM_FLOAT* fkz; - __device__ __constant__ PPPM_FLOAT* vg; - __device__ __constant__ int* part2grid; - __device__ __constant__ PPPM_FLOAT* density_brick; - __device__ __constant__ int* density_brick_int; - __device__ __constant__ PPPM_FLOAT density_intScale; - __device__ __constant__ PPPM_FLOAT* vdx_brick; - __device__ __constant__ PPPM_FLOAT* vdy_brick; - __device__ __constant__ PPPM_FLOAT* vdz_brick; - __device__ __constant__ PPPM_FLOAT* density_fft; - __device__ __constant__ ENERGY_FLOAT* energy; - __device__ __constant__ ENERGY_FLOAT* virial; - __device__ __constant__ int nxlo_in; - __device__ __constant__ int nxhi_in; - __device__ __constant__ int nxlo_out; - __device__ __constant__ int nxhi_out; - __device__ __constant__ int nylo_in; - __device__ __constant__ int nyhi_in; - __device__ __constant__ int nylo_out; - __device__ __constant__ int nyhi_out; - __device__ __constant__ int nzlo_in; - __device__ __constant__ int nzhi_in; - __device__ __constant__ int nzlo_out; - __device__ __constant__ int nzhi_out; - __device__ __constant__ int nxlo_fft; - __device__ __constant__ int nxhi_fft; - __device__ __constant__ int nylo_fft; - __device__ __constant__ int nyhi_fft; - __device__ __constant__ int nzlo_fft; - __device__ __constant__ int nzhi_fft; - __device__ __constant__ int nx_pppm; - __device__ __constant__ int ny_pppm; - __device__ __constant__ int nz_pppm; - __device__ __constant__ int slabflag; - __device__ __constant__ PPPM_FLOAT qqrd2e; - __device__ __constant__ int order; - //__device__ __constant__ float3 sublo; - __device__ __constant__ PPPM_FLOAT* rho_coeff; - __device__ __constant__ int nmax; - __device__ __constant__ int nlocal; - __device__ __constant__ PPPM_FLOAT* debugdata; - __device__ __constant__ PPPM_FLOAT delxinv; - __device__ __constant__ PPPM_FLOAT delyinv; - __device__ __constant__ PPPM_FLOAT delzinv; - __device__ __constant__ int nlower; - __device__ __constant__ int nupper; - __device__ __constant__ PPPM_FLOAT shiftone; - - +__device__ __constant__ FFT_FLOAT* work1; +__device__ __constant__ FFT_FLOAT* work2; +__device__ __constant__ FFT_FLOAT* work3; +__device__ __constant__ PPPM_FLOAT* greensfn; +__device__ __constant__ PPPM_FLOAT* gf_b; +__device__ __constant__ PPPM_FLOAT* fkx; +__device__ __constant__ PPPM_FLOAT* fky; +__device__ __constant__ PPPM_FLOAT* fkz; +__device__ __constant__ PPPM_FLOAT* vg; +__device__ __constant__ int* part2grid; +__device__ __constant__ PPPM_FLOAT* density_brick; +__device__ __constant__ int* density_brick_int; +__device__ __constant__ PPPM_FLOAT density_intScale; +__device__ __constant__ PPPM_FLOAT* vdx_brick; +__device__ __constant__ PPPM_FLOAT* vdy_brick; +__device__ __constant__ PPPM_FLOAT* vdz_brick; +__device__ __constant__ PPPM_FLOAT* density_fft; +__device__ __constant__ ENERGY_FLOAT* energy; +__device__ __constant__ ENERGY_FLOAT* virial; +__device__ __constant__ int nxlo_in; +__device__ __constant__ int nxhi_in; +__device__ __constant__ int nxlo_out; +__device__ __constant__ int nxhi_out; +__device__ __constant__ int nylo_in; +__device__ __constant__ int nyhi_in; +__device__ __constant__ int nylo_out; +__device__ __constant__ int nyhi_out; +__device__ __constant__ int nzlo_in; 
+__device__ __constant__ int nzhi_in; +__device__ __constant__ int nzlo_out; +__device__ __constant__ int nzhi_out; +__device__ __constant__ int nxlo_fft; +__device__ __constant__ int nxhi_fft; +__device__ __constant__ int nylo_fft; +__device__ __constant__ int nyhi_fft; +__device__ __constant__ int nzlo_fft; +__device__ __constant__ int nzhi_fft; +__device__ __constant__ int nx_pppm; +__device__ __constant__ int ny_pppm; +__device__ __constant__ int nz_pppm; +__device__ __constant__ int slabflag; +__device__ __constant__ PPPM_FLOAT qqrd2e; +__device__ __constant__ int order; +//__device__ __constant__ float3 sublo; +__device__ __constant__ PPPM_FLOAT* rho_coeff; +__device__ __constant__ int nmax; +__device__ __constant__ int nlocal; +__device__ __constant__ PPPM_FLOAT* debugdata; +__device__ __constant__ PPPM_FLOAT delxinv; +__device__ __constant__ PPPM_FLOAT delyinv; +__device__ __constant__ PPPM_FLOAT delzinv; +__device__ __constant__ int nlower; +__device__ __constant__ int nupper; +__device__ __constant__ PPPM_FLOAT shiftone; + + #include "pppm_cuda_kernel.cu" #include "stdio.h" void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial - ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg - ,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm - ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b - ,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag - ) + , void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + , int cu_nxlo_in, int cu_nxhi_in, int cu_nylo_in, int cu_nyhi_in, int cu_nzlo_in, int cu_nzhi_in, int cu_nxlo_out, int cu_nxhi_out, int cu_nylo_out, int cu_nyhi_out, int cu_nzlo_out, int cu_nzhi_out, int cu_nx_pppm, int cu_ny_pppm, int cu_nz_pppm + , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b + , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_int, int cu_slabflag + ) { CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); - cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int)); - cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int)); - cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int)); - cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, sizeof(int)); - cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int)); - cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int)); - cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int)); - 
cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int)); - cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int)); - cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int)); - cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int)); - cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int)); - cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int)); - cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int)); - cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int)); - cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int)); - cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int)); - cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int)); - cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int)); - cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int)); - cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int)); - cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int)); - cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int)); + cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int)); + cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int)); + cudaMemcpyToSymbol(nxhi_out, &cu_nxhi_out, sizeof(int)); + cudaMemcpyToSymbol(nylo_in, &cu_nylo_in, sizeof(int)); + cudaMemcpyToSymbol(nyhi_in, &cu_nyhi_in, sizeof(int)); + cudaMemcpyToSymbol(nylo_out, &cu_nylo_out, sizeof(int)); + cudaMemcpyToSymbol(nyhi_out, &cu_nyhi_out, sizeof(int)); + cudaMemcpyToSymbol(nzlo_in, &cu_nzlo_in, sizeof(int)); + cudaMemcpyToSymbol(nzhi_in, &cu_nzhi_in, sizeof(int)); + cudaMemcpyToSymbol(nzlo_out, &cu_nzlo_out, sizeof(int)); + cudaMemcpyToSymbol(nzhi_out, &cu_nzhi_out, sizeof(int)); + cudaMemcpyToSymbol(nxlo_fft, &cu_nxlo_fft, sizeof(int)); + cudaMemcpyToSymbol(nxhi_fft, &cu_nxhi_fft, sizeof(int)); + cudaMemcpyToSymbol(nylo_fft, &cu_nylo_fft, sizeof(int)); + cudaMemcpyToSymbol(nyhi_fft, &cu_nyhi_fft, sizeof(int)); + cudaMemcpyToSymbol(nzlo_fft, &cu_nzlo_fft, sizeof(int)); + cudaMemcpyToSymbol(nzhi_fft, &cu_nzhi_fft, sizeof(int)); + cudaMemcpyToSymbol(slabflag, &cu_slabflag, sizeof(int)); + cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int)); + cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int)); + cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int)); + cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(gf_b, 
&cu_gf_b, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*)); + + PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e; + cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(order, &cu_order, sizeof(int)); + cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*)); - PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e; - cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("order",&cu_order, sizeof(int)); - cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*)); - CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); -/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); + /*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); -#ifdef PPPM_PRECISION -if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); -if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); -#endif -#ifdef ENERGY_PRECISION -if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); -if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); -#endif -#ifdef ENERGY_PRECISION -if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); -if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); -#endif -#ifdef X_PRECISION -if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); -if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); -#endif -#ifdef F_PRECISION -if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); -if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); -#endif*/ + #ifdef PPPM_PRECISION + if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); + if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); + #endif + #ifdef ENERGY_PRECISION + if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); + if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); + #endif + #ifdef ENERGY_PRECISION + if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); + if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); + #endif + #ifdef X_PRECISION + if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); + if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); + #endif + #ifdef F_PRECISION + if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); + if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); + #endif*/ } -void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT 
cu_shiftone,PPPM_FLOAT cu_delxinv,PPPM_FLOAT cu_delyinv,PPPM_FLOAT cu_delzinv,int cu_nlower,int cu_nupper) +void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper) { - cudaMemcpyToSymbol("delxinv",&cu_delxinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("delyinv",&cu_delyinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("delzinv",&cu_delzinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("shiftone",&cu_shiftone, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("nlower",&cu_nlower, sizeof(int)); - cudaMemcpyToSymbol("nupper",&cu_nupper, sizeof(int)); - cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo, 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi, 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int)); + cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int)); + cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo, 3 * sizeof(X_FLOAT)); CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); } -void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa) +void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa) { - cudaMemcpyToSymbol("part2grid",&cu_part2grid, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); - //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); - cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); - cudaMemcpyToSymbol("nmax" , &nmaxa, sizeof(int)); + cudaMemcpyToSymbol(part2grid, &cu_part2grid, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); + cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); + cudaMemcpyToSymbol(nmax , &nmaxa, sizeof(int)); CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update"); - + } void pppm_update_nlocal(int nlocala) { - cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); + cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); }
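As in the pair styles, every cudaMemcpyToSymbol() call in these pppm hunks is migrated from the string form ("name") to the symbol itself; the string overload is deprecated and was later removed from the CUDA runtime. A minimal sketch of the pattern, using an illustrative constant that is not part of this patch:

  // Sketch: updating a __device__ __constant__ value by symbol, not by name.
  __device__ __constant__ int ngrid;   // 'ngrid' is an illustrative symbol

  void set_ngrid(int n)
  {
    // deprecated string form: cudaMemcpyToSymbol("ngrid", &n, sizeof(int));
    cudaMemcpyToSymbol(ngrid, &n, sizeof(int));  // count = size of the symbol's type
  }

The symbol form also lets the compiler verify that the symbol exists, which the string form could only catch at run time.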
-void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald) +void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - setup_fkxyz_vg<<<grid,threads,0>>>(unitkx,unitky,unitkz,g_ewald); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + setup_fkxyz_vg <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald); cudaThreadSynchronize(); - + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); } -void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, -int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) +void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald, + int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - setup_greensfn<<<grid,threads,0>>>(unitkx,unitky,unitkz,g_ewald,nbx,nby,nbz,xprd,yprd, zprd_slab); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + setup_greensfn <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald, nbx, nby, nbz, xprd, yprd, zprd_slab); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn "); } -void poisson_scale(int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_scale(int nx_pppma, int ny_pppma, int nz_pppma) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_scale_kernel<<<grid,threads,0>>>(); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_scale_kernel <<< grid, threads, 0>>>(); CUT_CHECK_ERROR("ERROR-CUDA poisson_scale "); } -void poisson_xgrad(int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_xgrad(int nx_pppma, int ny_pppma, int nz_pppma) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_xgrad_kernel<<<grid,threads,0>>>(); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_xgrad_kernel <<< grid, threads, 0>>>(); CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad "); } -void poisson_ygrad(int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_ygrad(int nx_pppma, int ny_pppma, int nz_pppma) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_ygrad_kernel<<<grid,threads,0>>>(); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_ygrad_kernel <<< grid, threads, 0>>>(); CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad "); } -void poisson_zgrad(int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_zgrad(int nx_pppma, int ny_pppma, int nz_pppma) { dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_zgrad_kernel<<<grid,threads,0>>>(); + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_zgrad_kernel <<< grid, threads, 0>>>(); CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad "); } -void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppma,int ny_pppma,int nz_pppma) +void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppma, 
int ny_pppma, int nz_pppma) { - + dim3 grid; dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x); - poisson_vdx_brick_kernel<<<grid,threads,0>>>(ilo,jlo,klo); + poisson_vdx_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick "); cudaThreadSynchronize(); } -void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) +void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm) { dim3 grid; dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; - poisson_vdy_brick_kernel<<<grid,threads,0>>>(ilo,jlo,klo); + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; + poisson_vdy_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick "); cudaThreadSynchronize(); } -void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) +void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm) { dim3 grid; dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; - poisson_vdz_brick_kernel<<<grid,threads,0>>>(ilo,jlo,klo); + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; + poisson_vdz_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick "); cudaThreadSynchronize(); } -void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag) +void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag) { //printf("VFLAG_GPU: %i\n",vflag); CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start "); dim3 grid; dim3 threads; - grid.x=nzhi_fft-nzlo_fft+1; - grid.y=nyhi_fft-nylo_fft+1; - grid.z=1; - threads.x=nxhi_fft-nxlo_fft+1; - threads.y=1; - threads.z=1; - poisson_energy_kernel<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(nxlo_fft,nylo_fft,nzlo_fft,vflag); - + grid.x = nzhi_fft - nzlo_fft + 1; + grid.y = nyhi_fft - nylo_fft + 1; + grid.z = 1; + threads.x = nxhi_fft - nxlo_fft + 1; + threads.y = 1; + threads.z = 1; + poisson_energy_kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag); + cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); }
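poisson_energy_kernel above and the sum_energy kernels below pass a dynamic shared-memory size as the third launch-configuration argument and accumulate per-cell energies in shared memory in two stages. For reference, a minimal single-block shared-memory reduction in the same spirit; the kernel name and the power-of-two block-size assumption are illustrative, not taken from this patch:

  // Sketch: tree reduction in dynamically sized shared memory.
  // Launch as: reduce1<<<1, n, n * sizeof(float)>>>(in, out);
  // where n == blockDim.x is a power of two.
  __global__ void reduce1(const float* in, float* out)
  {
    extern __shared__ float s[];               // sized at launch time
    int i = threadIdx.x;
    s[i] = in[i];
    __syncthreads();

    for(int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
      if(i < stride) s[i] += s[i + stride];    // pairwise partial sums
      __syncthreads();
    }

    if(i == 0) *out = s[0];                    // thread 0 writes the total
  }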
-ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial) +ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial) { - ENERGY_FLOAT host_energy=0; + ENERGY_FLOAT host_energy = 0; dim3 grid; dim3 threads; - grid.x=nz_pppma; - grid.y=1; - grid.z=1; - threads.x=ny_pppma; - threads.y=1; - threads.z=1; - sum_energy_kernel1<<<grid,threads,ny_pppma*sizeof(ENERGY_FLOAT)>>>(vflag); + grid.x = nz_pppma; + grid.y = 1; + grid.z = 1; + threads.x = ny_pppma; + threads.y = 1; + threads.z = 1; + sum_energy_kernel1 <<< grid, threads, ny_pppma* sizeof(ENERGY_FLOAT)>>>(vflag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 "); - grid.x=1; - grid.y=1; - grid.z=1; - threads.x=nz_pppma; - threads.y=1; - threads.z=1; - sum_energy_kernel2<<<grid,threads,nz_pppma*sizeof(ENERGY_FLOAT)>>>(vflag); + grid.x = 1; + grid.y = 1; + grid.z = 1; + threads.x = nz_pppma; + threads.y = 1; + threads.z = 1; + sum_energy_kernel2 <<< grid, threads, nz_pppma* sizeof(ENERGY_FLOAT)>>>(vflag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 "); - cudaMemcpy((void*) (&host_energy), cu_energy, sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); + if(vflag) - cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy"); - + return host_energy; } -void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int) +void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int) { CUT_CHECK_ERROR("cuda_make_rho begin"); - dim3 grid,threads; + dim3 grid, threads; int cpu_flag[3]; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - int sharedmemsize=(32+32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); - do - { - cpu_flag[0]=0; - cpu_flag[1]=0; - cpu_flag[2]=0; - cudaMemcpyToSymbol("density_intScale",cu_density_intScale,sizeof(PPPM_FLOAT*)); + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; + int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT); + + do { + cpu_flag[0] = 0; + cpu_flag[1] = 0; + cpu_flag[2] = 0; + cudaMemcpyToSymbol(density_intScale, cu_density_intScale, sizeof(PPPM_FLOAT)); CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z"); - cudaMemset(flag,0,3*sizeof(int)); + cudaMemset(flag, 0, 3 * sizeof(int)); CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A"); - cudaMemset(cu_density_brick,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(PPPM_FLOAT)); + cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(PPPM_FLOAT)); CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B"); - cudaMemset(cu_density_brick_int,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(int)); + cudaMemset(cu_density_brick_int, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(int)); CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C"); - make_rho_kernel<<<grid,threads,sharedmemsize>>>((int*) flag,32/(sdata->pppm.nupper-sdata->pppm.nlower+1)); + make_rho_kernel <<< grid, threads, sharedmemsize>>>((int*) flag, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1)); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA make_rho A"); - cudaMemcpy((void*) &cpu_flag, flag, 3*sizeof(int),cudaMemcpyDeviceToHost); - if(cpu_flag[0]!=0) {(*cu_density_intScale)/=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n",*cu_density_intScale);)} - if((cpu_flag[0]==0)&&(cpu_flag[1]==0)) {(*cu_density_intScale)*=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n",*cu_density_intScale);)} - /* if((*cu_density_intScale)>0xe0000000) - { - 
printf("Error Scaling\n"); - cpu_flag[0]=0; - cpu_flag[1]=1; - }*/ + cudaMemcpy((void*) &cpu_flag, flag, 3 * sizeof(int), cudaMemcpyDeviceToHost); + + if(cpu_flag[0] != 0) { + (*cu_density_intScale) /= 2; + MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n", *cu_density_intScale);) + } + if((cpu_flag[0] == 0) && (cpu_flag[1] == 0)) { + (*cu_density_intScale) *= 2; + MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n", *cu_density_intScale);) + } + /* if((*cu_density_intScale)>0xe0000000) + { + printf("Error Scaling\n"); + cpu_flag[0]=0; + cpu_flag[1]=1; + }*/ CUT_CHECK_ERROR("ERROR-CUDA make_rho B"); - } while((cpu_flag[0]!=0)||(cpu_flag[1]==0)); - - - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - threads.x=ihi-ilo+1; - scale_rho_kernel<<<grid,threads,0>>>(); + } while((cpu_flag[0] != 0) || (cpu_flag[1] == 0)); + + + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + threads.x = ihi - ilo + 1; + scale_rho_kernel <<< grid, threads, 0>>>(); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale"); } -int cuda_particle_map(cuda_shared_data* sdata,void* flag) +int cuda_particle_map(cuda_shared_data* sdata, void* flag) { - dim3 grid,threads; + dim3 grid, threads; int cpu_flag; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; CUT_CHECK_ERROR("ERROR-CUDA particle_map ..pre"); - particle_map_kernel<<<grid,threads,0>>>((int*) flag); + particle_map_kernel <<< grid, threads, 0>>>((int*) flag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA particle_map a"); - cudaMemcpy((void*) &cpu_flag, flag, sizeof(int),cudaMemcpyDeviceToHost); + cudaMemcpy((void*) &cpu_flag, flag, sizeof(int), cudaMemcpyDeviceToHost); CUT_CHECK_ERROR("ERROR-CUDA particle_map b"); return cpu_flag; } -void cuda_fieldforce(cuda_shared_data* sdata,void* flag) +void cuda_fieldforce(cuda_shared_data* sdata, void* flag) { - dim3 grid,threads; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - int sharedmemsize=(32+3*32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); - fieldforce_kernel<<<grid,threads,sharedmemsize>>> - (sdata->pppm.nupper-sdata->pppm.nlower+1,32/(sdata->pppm.nupper-sdata->pppm.nlower+1),(int*) flag); + dim3 grid, threads; + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; + int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT); + fieldforce_kernel <<< grid, threads, sharedmemsize>>> + (sdata->pppm.nupper - sdata->pppm.nlower + 1, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1), (int*) flag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA fieldforce"); } double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf) { - dim3 grid,threads; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - slabcorr_energy_kernel<<<grid,threads,32*sizeof(ENERGY_FLOAT)>>>(dev_buf); + dim3 grid, threads; + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; + slabcorr_energy_kernel <<< grid, threads, 32* sizeof(ENERGY_FLOAT)>>>(dev_buf); cudaThreadSynchronize(); - cudaMemcpy((void*) buf, 
dev_buf, grid.x*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); - - double dipole_all=0.0; - for(int i=0;i<grid.x;i++) - dipole_all+=buf[i]; - return dipole_all; + cudaMemcpy((void*) buf, dev_buf, grid.x * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost); + + double dipole_all = 0.0; + + for(int i = 0; i < grid.x; i++) + dipole_all += buf[i]; + + return dipole_all; } -void cuda_slabcorr_force(cuda_shared_data* sdata,F_FLOAT ffact) +void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact) { - dim3 grid,threads; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - slabcorr_force_kernel<<<grid,threads>>>(ffact); + dim3 grid, threads; + grid.x = (sdata->atom.nlocal + 31) / 32; + grid.y = 1; + grid.z = 1; + threads.x = 32; + threads.y = 1; + threads.z = 1; + slabcorr_force_kernel <<< grid, threads>>>(ffact); cudaThreadSynchronize(); } @@ -519,59 +528,59 @@ void sum_virial(double* host_virial) { } -void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out) +void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out) { - int nslow=sdata->pppm.nzhi_in-sdata->pppm.nzlo_in; - int nmid=sdata->pppm.nyhi_in-sdata->pppm.nylo_in; - int nfast=sdata->pppm.nxhi_in-sdata->pppm.nxlo_in; - int nrimz=MAX(sdata->pppm.nzlo_in-sdata->pppm.nzlo_out,sdata->pppm.nzhi_out-sdata->pppm.nzhi_in); - int nrimy=MAX(sdata->pppm.nylo_in-sdata->pppm.nylo_out,sdata->pppm.nyhi_out-sdata->pppm.nyhi_in); - int nrimx=MAX(sdata->pppm.nxlo_in-sdata->pppm.nxlo_out,sdata->pppm.nxhi_out-sdata->pppm.nxhi_in); + int nslow = sdata->pppm.nzhi_in - sdata->pppm.nzlo_in; + int nmid = sdata->pppm.nyhi_in - sdata->pppm.nylo_in; + int nfast = sdata->pppm.nxhi_in - sdata->pppm.nxlo_in; + int nrimz = MAX(sdata->pppm.nzlo_in - sdata->pppm.nzlo_out, sdata->pppm.nzhi_out - sdata->pppm.nzhi_in); + int nrimy = MAX(sdata->pppm.nylo_in - sdata->pppm.nylo_out, sdata->pppm.nyhi_out - sdata->pppm.nyhi_in); + int nrimx = MAX(sdata->pppm.nxlo_in - sdata->pppm.nxlo_out, sdata->pppm.nxhi_out - sdata->pppm.nxhi_in); dim3 grid; - grid.x=nslow+1; - grid.y=nmid+1; - grid.z=1; + grid.x = nslow + 1; + grid.y = nmid + 1; + grid.z = 1; dim3 threads; - threads.x=nfast+1; - threads.y=1; - threads.z=1; + threads.x = nfast + 1; + threads.y = 1; + threads.z = 1; cudaThreadSynchronize(); - initfftdata_core_kernel<<<grid,threads,0>>>(in,out); + initfftdata_core_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nmid+1; - threads.x=nfast+1; - initfftdata_z_kernel<<<grid,threads,0>>>(in,out); + grid.x = nrimz; + grid.y = nmid + 1; + threads.x = nfast + 1; + initfftdata_z_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nrimy; - threads.x=nfast+1; - initfftdata_y_kernel<<<grid,threads,0>>>(in,out); + grid.x = nslow + 1; + grid.y = nrimy; + threads.x = nfast + 1; + initfftdata_y_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nmid+1; - threads.x=nrimx; - initfftdata_x_kernel<<<grid,threads,0>>>(in,out); + grid.x = nslow + 1; + grid.y = nmid + 1; + threads.x = nrimx; + initfftdata_x_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nrimy; - threads.x=nfast+1; - initfftdata_yz_kernel<<<grid,threads,0>>>(in,out); + grid.x = nrimz; + grid.y = nrimy; + threads.x = nfast + 1; + initfftdata_yz_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nmid+1; - threads.x=nrimx; - initfftdata_xz_kernel<<<grid,threads,0>>>(in,out); + grid.x = nrimz; + grid.y = nmid + 1; + threads.x = nrimx; + initfftdata_xz_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nrimy; - threads.x=nrimx; - initfftdata_xy_kernel<<<grid,threads,0>>>(in,out); + grid.x = nslow + 1; + grid.y = nrimy; + threads.x = nrimx; + initfftdata_xy_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nrimy; - threads.x=nrimx; - initfftdata_xyz_kernel<<<grid,threads,0>>>(in,out); + grid.x = nrimz; + grid.y = nrimy; + 
threads.x = nrimx; + initfftdata_xyz_kernel <<< grid, threads, 0>>>(in, out); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel"); }
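A closing note on the timing code scattered through these files: each kernel is bracketed by clock_gettime(CLOCK_REALTIME, ...) samples with a synchronization in between, and the tv_sec/tv_nsec arithmetic is repeated inline every time. cudaThreadSynchronize() is deprecated in current CUDA releases in favor of cudaDeviceSynchronize(). A small helper in the same style; the helper name is illustrative and not from this patch:

  #include <time.h>

  // Sketch: wall-clock interval in seconds between two CLOCK_REALTIME samples,
  // matching the inline tv_sec/tv_nsec arithmetic used above.
  static double elapsed_seconds(const timespec& t1, const timespec& t2)
  {
    return t2.tv_sec - t1.tv_sec + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec);
  }

  // Usage on the host:
  //   timespec t1, t2;
  //   clock_gettime(CLOCK_REALTIME, &t1);
  //   some_kernel<<<grid, threads>>>();
  //   cudaDeviceSynchronize();   // wait for the kernel before sampling the clock
  //   clock_gettime(CLOCK_REALTIME, &t2);
  //   timings += elapsed_seconds(t1, t2);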