commit 4895715bdd (parent 1f9963cb32)
Author: sjplimp
Date:   2012-10-04 14:58:13 +00:00

    git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8904 f3b2605a-c512-4ea7-a41b-209d697bcdaa

7 changed files with 2079 additions and 1992 deletions


@@ -14,7 +14,8 @@ SHELL = /bin/sh
 # System-specific settings
-CUDA_INSTALL_PATH = /usr/local/cuda
+#CUDA_INSTALL_PATH = /usr/local/cuda
+CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
 # e.g. in Gentoo
 # CUDA_INSTALL_PATH = /opt/cuda

@@ -97,8 +98,22 @@ else
 NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
 SMVERSIONFLAGS := -arch sm_21
 else
-CUDA_FLAGS += -DCUDA_ARCH=99
-SMVERSIONFLAGS := -arch sm_13
+ifeq ($(strip $(arch)), 30)
+CUDA_FLAGS += -DCUDA_ARCH=20
+#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
+NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
+SMVERSIONFLAGS := -arch sm_30
+else
+ifeq ($(strip $(arch)), 35)
+CUDA_FLAGS += -DCUDA_ARCH=20
+#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
+NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
+SMVERSIONFLAGS := -arch sm_35
+else
+CUDA_FLAGS += -DCUDA_ARCH=99
+SMVERSIONFLAGS := -arch sm_13
+endif
+endif
 endif
 endif
 endif

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,22 +1,22 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

@@ -54,17 +54,17 @@ __device__ __constant__ F_FLOAT* MY_AP(fp);

#define _rhor_spline_tex MY_AP(rhor_spline_tex)
#if F_PRECISION == 1
texture<float4, 1> _rhor_spline_tex;
#else
texture<int4, 1> _rhor_spline_tex;
#endif

#define _z2r_spline_tex MY_AP(z2r_spline_tex)
#if F_PRECISION == 1
texture<float4, 1> _z2r_spline_tex;
#else
texture<int4, 1> _z2r_spline_tex;
#endif

@@ -85,243 +85,258 @@ inline void BindEAMTextures(cuda_shared_data* sdata)

  _rhor_spline_tex.normalized = false;                    // access with normalized texture coordinates
  _rhor_spline_tex.filterMode = cudaFilterModePoint;      // Point mode, so no
  _rhor_spline_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates

  const textureReference* rhor_spline_texture_ptr = &MY_AP(rhor_spline_tex);

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
#else
  cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
#endif

  _z2r_spline_tex.normalized = false;                    // access with normalized texture coordinates
  _z2r_spline_tex.filterMode = cudaFilterModePoint;      // Point mode, so no
  _z2r_spline_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates

  const textureReference* z2r_spline_texture_ptr = &MY_AP(z2r_spline_tex);

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
#else
  cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
#endif
}

void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
  int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);

  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)

    if(sdata->buffer != NULL) cudaFree(sdata->buffer);

    cudaMalloc((void**)&sdata->buffer, size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
}

void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0], sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(firstneigh),        & sneighlist->firstneigh.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ilist),             & sneighlist->ilist.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(inum),              & sneighlist->inum, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal),            & sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax),              & sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(numneigh),          & sneighlist->numneigh.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(neighbors),         & sneighlist->neighbors.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(maxneighbors),      & sneighlist->maxneighbors, sizeof(int));
}

void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
  cudaMemcpyToSymbol(MY_AP(x),      & sdata->atom.x.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(x_type), & sdata->atom.x_type.dev_data, sizeof(X_FLOAT4*));
  cudaMemcpyToSymbol(MY_AP(f),      & sdata->atom.f.dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(type),   & sdata->atom.type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(tag),    & sdata->atom.tag.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(eatom),  & sdata->atom.eatom.dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(vatom),  & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*));
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
}

void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r,
                           void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp,
                           int* type2frho, int** type2z2r, int** type2rhor)
{
  // !! LAMMPS indexes atom types starting with 1 !!
  unsigned cuda_ntypes = sdata->atom.ntypes + 1;

  if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
    printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
           "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);

  unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;

  X_FLOAT cutsq_global;
  cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
  cudaMemcpyToSymbol(MY_AP(cutsq_global), &cutsq_global, sizeof(X_FLOAT));

  F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];

  for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];

  cudaMemcpyToSymbol(MY_AP(coeff1), coeff_buf, cuda_ntypes * sizeof(F_FLOAT));

  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];

  cudaMemcpyToSymbol(MY_AP(coeff2), coeff_buf, nI);

  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2z2r[0][0])[i];

  cudaMemcpyToSymbol(MY_AP(coeff3), coeff_buf, nI);

  delete [] coeff_buf;

  X_FLOAT box_size[3] = {
    sdata->domain.subhi[0] - sdata->domain.sublo[0],
    sdata->domain.subhi[1] - sdata->domain.sublo[1],
    sdata->domain.subhi[2] - sdata->domain.sublo[2]
  };
  F_FLOAT rdr_F = rdr;
  F_FLOAT rdrho_F = rdrho;
  cudaMemcpyToSymbol(MY_AP(box_size),    box_size, sizeof(X_FLOAT) * 3);
  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes, sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(virial),      &sdata->pair.virial.dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(eng_vdwl),    &sdata->pair.eng_vdwl.dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
  cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
  cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));

  rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
  z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
  rhor_spline_pointer = rhor_spline;
  z2r_spline_pointer = z2r_spline;

  CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
}

void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  if(sdata->atom.update_nmax)
    Cuda_PairEAMCuda_UpdateNmax(sdata, sneighlist);

  if(sdata->atom.update_neigh)
    Cuda_PairEAMCuda_UpdateNeighbor(sdata, sneighlist);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));

  if(sdata->buffer_new)
    Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);

  cudaMemcpyToSymbol(MY_AP(eatom), & sdata->atom.eatom.dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(vatom), & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*));

  int sharedperproc = 0;

  if(eflag || eflag_atom) sharedperproc = 1;

  if(vflag || vflag_atom) sharedperproc = 7;

  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  eam_buff_offset = grid.x * grid.y;

  BindXTypeTexture(sdata);
  BindEAMTextures(sdata); // initialize only on first call

  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
  PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc * sizeof(ENERGY_FLOAT) * threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
  MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
}

void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  int sharedperproc = 0;

  if(eflag || eflag_atom) sharedperproc = 1;

  if(vflag || vflag_atom) sharedperproc = 7;

  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  BindXTypeTexture(sdata);
  BindEAMTextures(sdata); // initialize only on first call

  sdata->pair.lastgridsize = grid.x * grid.y;
  sdata->pair.n_energy_virial = sharedperproc;

  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
  PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc * sizeof(ENERGY_FLOAT) * threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");

  if(eflag || vflag) {
    int n = grid.x * grid.y;
    grid.x = sharedperproc;
    grid.y = 1;
    threads.x = 256;
    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x * sizeof(ENERGY_FLOAT) * sharedperproc>>>(n);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
  }

  MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
}

void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send)
{
  int3 layout = getgrid(n, 0);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
  PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n
      , sdata->comm.maxlistlength, iswap, buf);
  cudaThreadSynchronize();
  cudaMemcpy(buf_send, buf, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
  cudaThreadSynchronize();
}

void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp)
{
  F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
  cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice);
}

#undef _type2frho

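For context, the EAM changes above drop the by-name lookup of the texture references (cudaGetTextureReference with a MY_CONST(...) name) and instead take the address of the texture reference directly. A minimal standalone sketch of that pattern follows; my_tex and bind_spline are illustrative names, not part of this commit, and the F_PRECISION switch is omitted.

// Sketch only: binding a legacy texture reference without a by-name lookup.
#include <cuda_runtime.h>

texture<float4, 1> my_tex;   // file-scope texture reference, hypothetical

void bind_spline(const void* dev_ptr, size_t bytes)
{
  my_tex.normalized = false;                      // raw (unnormalized) coordinates
  my_tex.filterMode = cudaFilterModePoint;        // no interpolation
  my_tex.addressMode[0] = cudaAddressModeWrap;

  // Take the reference's address directly instead of cudaGetTextureReference("name").
  const textureReference* tex_ptr = &my_tex;
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, tex_ptr, dev_ptr, &desc, bytes);
}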

@@ -1,22 +1,22 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

@@ -24,116 +24,115 @@

#include <stdio.h>
#include "pair_sw_cuda_cu.h"

__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR];

#include "pair_sw_cuda_kernel_nc.cu"
#include <time.h>

void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h)
{
  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
  X_FLOAT box_size[3] = {
    sdata->domain.subhi[0] - sdata->domain.sublo[0],
    sdata->domain.subhi[1] - sdata->domain.sublo[1],
    sdata->domain.subhi[2] - sdata->domain.sublo[2]
  };
  cudaMemcpyToSymbol(MY_AP(box_size),    box_size, sizeof(X_FLOAT) * 3);
  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), &cuda_ntypes, sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(virial),      &sdata->pair.virial.dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(eng_vdwl),    &sdata->pair.eng_vdwl.dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later, sizeof(int));
  cudaMemcpyToSymbol(params_sw, params_host, sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h);
  cudaMemcpyToSymbol(elem2param, elem2param_host, sizeof(int)*nelements_h * nelements_h * nelements_h);
  cudaMemcpyToSymbol(map, map_host, sizeof(int)*cuda_ntypes);
  cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
}

void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static int glob_ij_size = 0;
  static F_FLOAT4* glob_r_ij = NULL;
  static int* glob_numneigh_red = NULL;
  static int* glob_neighbors_red = NULL;
  static int* glob_neightype_red = NULL;

  if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
    glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
    cudaFree(glob_r_ij);
    cudaFree(glob_numneigh_red);
    cudaFree(glob_neighbors_red);
    cudaFree(glob_neightype_red);
    cudaMalloc(&glob_r_ij, glob_ij_size * 4);
    cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
    cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
    cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
    cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red, sizeof(int*));
    cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red, sizeof(int*));
    cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red, sizeof(int*));
    cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij, sizeof(F_FLOAT4*));
  }

  dim3 grid, threads;
  int sharedperproc;

  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();

  dim3 grid2;

  if(sdata->atom.nall <= 256 * 64000) {
    grid2.x = (sdata->atom.nall + 255) / 256;
    grid2.y = 1;
  } else {
    grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
    grid2.y = 128;
  }

  grid2.z = 1;
  dim3 threads2;
  threads2.x = 256;
  threads2.y = 1;
  threads2.z = 1;
  timespec time1, time2;

  //pre-calculate all neighbordistances and zeta_ij
  clock_gettime(CLOCK_REALTIME, &time1);
  Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>();
  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.test1 +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
  clock_gettime(CLOCK_REALTIME, &time1);

  //actual force calculation
  unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure

  if(eflag) {
    if(vflag)
      Pair_SW_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>>
      (eflag_atom, vflag_atom);
    else
      Pair_SW_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>>
      (eflag_atom, vflag_atom);
  } else {
    if(vflag)
      Pair_SW_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>>
      (eflag_atom, vflag_atom);
    else
      Pair_SW_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>>
      (eflag_atom, vflag_atom);
  }

  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.test2 +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

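The recurring change in these files is that cudaMemcpyToSymbol() is now passed the __constant__ symbol itself (params_sw, elem2param, map, nelements, ...) rather than its name as a string; the string-based overloads were deprecated in CUDA 4.1 and removed in CUDA 5.0, which this commit appears to track. A minimal standalone sketch of the symbol-based form; nelements_dev and show() are hypothetical names, not taken from the commit.

// Sketch only: copy host data to a __constant__ symbol by passing the symbol,
// not a string with its name, to cudaMemcpyToSymbol.
#include <cstdio>
#include <cuda_runtime.h>

__device__ __constant__ int nelements_dev;   // hypothetical device constant

__global__ void show()
{
  printf("nelements on device: %d\n", nelements_dev);
}

int main()
{
  int nelements_h = 3;
  cudaMemcpyToSymbol(nelements_dev, &nelements_h, sizeof(int));  // symbol, src, count
  show<<<1, 1>>>();
  cudaDeviceSynchronize();
  return 0;
}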

@@ -1,22 +1,22 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

@@ -25,7 +25,7 @@

#include "pair_tersoff_cuda_cu.h"

__device__ __constant__ Param_Float params[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR];
__device__ __constant__ F_FLOAT* _glob_zeta_ij;  //zeta_ij
__device__ __constant__ F_FLOAT4* _glob_r_ij;    //r_ij (x,y,z,r^2) for pairs within force cutoff
__device__ __constant__ bool _zbl;               //is tersoff zbl?

@@ -36,119 +36,118 @@ __device__ __constant__ bool _zbl; //is tersoff zbl?

#include <time.h>

void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl)
{
  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
  X_FLOAT box_size[3] = {
    sdata->domain.subhi[0] - sdata->domain.sublo[0],
    sdata->domain.subhi[1] - sdata->domain.sublo[1],
    sdata->domain.subhi[2] - sdata->domain.sublo[2]
  };
  cudaMemcpyToSymbol(MY_AP(box_size),    box_size, sizeof(X_FLOAT) * 3);
  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), &cuda_ntypes, sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(virial),      &sdata->pair.virial.dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(eng_vdwl),    &sdata->pair.eng_vdwl.dev_data, sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later, sizeof(int));
  cudaMemcpyToSymbol(params, params_host, sizeof(Param_Float)*nelements_h * nelements_h * nelements_h);
  cudaMemcpyToSymbol(elem2param, elem2param_host, sizeof(int)*nelements_h * nelements_h * nelements_h);
  cudaMemcpyToSymbol(map, map_host, sizeof(int)*cuda_ntypes);
  cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
  cudaMemcpyToSymbol(_zbl, &zbl, sizeof(bool));
}

void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static F_FLOAT* glob_zeta_ij = NULL;
  static int glob_zeta_ij_size = 0;
  static F_FLOAT4* glob_r_ij = NULL;
  static int* glob_numneigh_red = NULL;
  static int* glob_neighbors_red = NULL;
  static int* glob_neightype_red = NULL;

  if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
    glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
    cudaFree(glob_zeta_ij);
    cudaFree(glob_r_ij);
    cudaFree(glob_numneigh_red);
    cudaFree(glob_neighbors_red);
    cudaFree(glob_neightype_red);
    cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size);
    cudaMalloc(&glob_r_ij, glob_zeta_ij_size * 4);
    cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
    cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
    cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
    cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red, sizeof(int*));
    cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red, sizeof(int*));
    cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red, sizeof(int*));
    cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij, sizeof(F_FLOAT4*));
    cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij, sizeof(F_FLOAT*));
  }

  dim3 grid, threads;
  int sharedperproc;

  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();

  dim3 grid2;

  if(sdata->atom.nall <= 256 * 64000) {
    grid2.x = (sdata->atom.nall + 255) / 256;
    grid2.y = 1;
  } else {
    grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
    grid2.y = 128;
  }

  grid2.z = 1;
  dim3 threads2;
  threads2.x = 256;
  threads2.y = 1;
  threads2.z = 1;
  timespec time1, time2;

  //pre-calculate all neighbordistances and zeta_ij
  clock_gettime(CLOCK_REALTIME, &time1);
  Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>
  ();
  cudaThreadSynchronize();
  Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>>
  ();
  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.test1 +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
  clock_gettime(CLOCK_REALTIME, &time1);

  //actual force calculation
  unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure

  if(eflag) {
    if(vflag)
      Pair_Tersoff_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>>
      (eflag_atom, vflag_atom);
    else
      Pair_Tersoff_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>>
      (eflag_atom, vflag_atom);
  } else {
    if(vflag)
      Pair_Tersoff_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>>
      (eflag_atom, vflag_atom);
    else
      Pair_Tersoff_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>>
      (eflag_atom, vflag_atom);
  }

  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.test2 +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}


@ -1,22 +1,22 @@
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version: Original Version:
http://lammps.sandia.gov, Sandia National Laboratories http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory. See the README file in the top-level LAMMPS directory.
----------------------------------------------------------------------- -----------------------------------------------------------------------
USER-CUDA Package and associated modifications: USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/ https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory. See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License. This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
@ -34,484 +34,493 @@
#define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b))
__device__ __constant__ FFT_FLOAT* work1; __device__ __constant__ FFT_FLOAT* work1;
__device__ __constant__ FFT_FLOAT* work2; __device__ __constant__ FFT_FLOAT* work2;
__device__ __constant__ FFT_FLOAT* work3; __device__ __constant__ FFT_FLOAT* work3;
__device__ __constant__ PPPM_FLOAT* greensfn; __device__ __constant__ PPPM_FLOAT* greensfn;
__device__ __constant__ PPPM_FLOAT* gf_b; __device__ __constant__ PPPM_FLOAT* gf_b;
__device__ __constant__ PPPM_FLOAT* fkx; __device__ __constant__ PPPM_FLOAT* fkx;
__device__ __constant__ PPPM_FLOAT* fky; __device__ __constant__ PPPM_FLOAT* fky;
__device__ __constant__ PPPM_FLOAT* fkz; __device__ __constant__ PPPM_FLOAT* fkz;
__device__ __constant__ PPPM_FLOAT* vg; __device__ __constant__ PPPM_FLOAT* vg;
__device__ __constant__ int* part2grid; __device__ __constant__ int* part2grid;
__device__ __constant__ PPPM_FLOAT* density_brick; __device__ __constant__ PPPM_FLOAT* density_brick;
__device__ __constant__ int* density_brick_int; __device__ __constant__ int* density_brick_int;
__device__ __constant__ PPPM_FLOAT density_intScale; __device__ __constant__ PPPM_FLOAT density_intScale;
__device__ __constant__ PPPM_FLOAT* vdx_brick; __device__ __constant__ PPPM_FLOAT* vdx_brick;
__device__ __constant__ PPPM_FLOAT* vdy_brick; __device__ __constant__ PPPM_FLOAT* vdy_brick;
__device__ __constant__ PPPM_FLOAT* vdz_brick; __device__ __constant__ PPPM_FLOAT* vdz_brick;
__device__ __constant__ PPPM_FLOAT* density_fft; __device__ __constant__ PPPM_FLOAT* density_fft;
__device__ __constant__ ENERGY_FLOAT* energy; __device__ __constant__ ENERGY_FLOAT* energy;
__device__ __constant__ ENERGY_FLOAT* virial; __device__ __constant__ ENERGY_FLOAT* virial;
__device__ __constant__ int nxlo_in; __device__ __constant__ int nxlo_in;
__device__ __constant__ int nxhi_in; __device__ __constant__ int nxhi_in;
__device__ __constant__ int nxlo_out; __device__ __constant__ int nxlo_out;
__device__ __constant__ int nxhi_out; __device__ __constant__ int nxhi_out;
__device__ __constant__ int nylo_in; __device__ __constant__ int nylo_in;
__device__ __constant__ int nyhi_in; __device__ __constant__ int nyhi_in;
__device__ __constant__ int nylo_out; __device__ __constant__ int nylo_out;
__device__ __constant__ int nyhi_out; __device__ __constant__ int nyhi_out;
__device__ __constant__ int nzlo_in; __device__ __constant__ int nzlo_in;
__device__ __constant__ int nzhi_in; __device__ __constant__ int nzhi_in;
__device__ __constant__ int nzlo_out; __device__ __constant__ int nzlo_out;
__device__ __constant__ int nzhi_out; __device__ __constant__ int nzhi_out;
__device__ __constant__ int nxlo_fft; __device__ __constant__ int nxlo_fft;
__device__ __constant__ int nxhi_fft; __device__ __constant__ int nxhi_fft;
__device__ __constant__ int nylo_fft; __device__ __constant__ int nylo_fft;
__device__ __constant__ int nyhi_fft; __device__ __constant__ int nyhi_fft;
__device__ __constant__ int nzlo_fft; __device__ __constant__ int nzlo_fft;
__device__ __constant__ int nzhi_fft; __device__ __constant__ int nzhi_fft;
__device__ __constant__ int nx_pppm; __device__ __constant__ int nx_pppm;
__device__ __constant__ int ny_pppm; __device__ __constant__ int ny_pppm;
__device__ __constant__ int nz_pppm; __device__ __constant__ int nz_pppm;
__device__ __constant__ int slabflag; __device__ __constant__ int slabflag;
__device__ __constant__ PPPM_FLOAT qqrd2e; __device__ __constant__ PPPM_FLOAT qqrd2e;
__device__ __constant__ int order; __device__ __constant__ int order;
//__device__ __constant__ float3 sublo; //__device__ __constant__ float3 sublo;
__device__ __constant__ PPPM_FLOAT* rho_coeff; __device__ __constant__ PPPM_FLOAT* rho_coeff;
__device__ __constant__ int nmax; __device__ __constant__ int nmax;
__device__ __constant__ int nlocal; __device__ __constant__ int nlocal;
__device__ __constant__ PPPM_FLOAT* debugdata; __device__ __constant__ PPPM_FLOAT* debugdata;
__device__ __constant__ PPPM_FLOAT delxinv; __device__ __constant__ PPPM_FLOAT delxinv;
__device__ __constant__ PPPM_FLOAT delyinv; __device__ __constant__ PPPM_FLOAT delyinv;
__device__ __constant__ PPPM_FLOAT delzinv; __device__ __constant__ PPPM_FLOAT delzinv;
__device__ __constant__ int nlower; __device__ __constant__ int nlower;
__device__ __constant__ int nupper; __device__ __constant__ int nupper;
__device__ __constant__ PPPM_FLOAT shiftone; __device__ __constant__ PPPM_FLOAT shiftone;
#include "pppm_cuda_kernel.cu" #include "pppm_cuda_kernel.cu"
#include "stdio.h" #include "stdio.h"
void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial
,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg , void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg
,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm , int cu_nxlo_in, int cu_nxhi_in, int cu_nylo_in, int cu_nyhi_in, int cu_nzlo_in, int cu_nzhi_in, int cu_nxlo_out, int cu_nxhi_out, int cu_nylo_out, int cu_nyhi_out, int cu_nzlo_out, int cu_nzhi_out, int cu_nx_pppm, int cu_ny_pppm, int cu_nz_pppm
,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b
,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_int, int cu_slabflag
) )
{ {
CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start");
cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*)); cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*)); cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int)); cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int));
cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int)); cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int));
cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int)); cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int));
cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, sizeof(int)); cudaMemcpyToSymbol(nxhi_out, &cu_nxhi_out, sizeof(int));
cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int)); cudaMemcpyToSymbol(nylo_in, &cu_nylo_in, sizeof(int));
cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int)); cudaMemcpyToSymbol(nyhi_in, &cu_nyhi_in, sizeof(int));
cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int)); cudaMemcpyToSymbol(nylo_out, &cu_nylo_out, sizeof(int));
cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int)); cudaMemcpyToSymbol(nyhi_out, &cu_nyhi_out, sizeof(int));
cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int)); cudaMemcpyToSymbol(nzlo_in, &cu_nzlo_in, sizeof(int));
cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int)); cudaMemcpyToSymbol(nzhi_in, &cu_nzhi_in, sizeof(int));
cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int)); cudaMemcpyToSymbol(nzlo_out, &cu_nzlo_out, sizeof(int));
cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int)); cudaMemcpyToSymbol(nzhi_out, &cu_nzhi_out, sizeof(int));
cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int)); cudaMemcpyToSymbol(nxlo_fft, &cu_nxlo_fft, sizeof(int));
cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int)); cudaMemcpyToSymbol(nxhi_fft, &cu_nxhi_fft, sizeof(int));
cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int)); cudaMemcpyToSymbol(nylo_fft, &cu_nylo_fft, sizeof(int));
cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int)); cudaMemcpyToSymbol(nyhi_fft, &cu_nyhi_fft, sizeof(int));
cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int)); cudaMemcpyToSymbol(nzlo_fft, &cu_nzlo_fft, sizeof(int));
cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int)); cudaMemcpyToSymbol(nzhi_fft, &cu_nzhi_fft, sizeof(int));
cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int)); cudaMemcpyToSymbol(slabflag, &cu_slabflag, sizeof(int));
cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int)); cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int));
cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int)); cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int));
cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int)); cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int));
cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*)); cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*)); cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*)); cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(gf_b, &cu_gf_b, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*)); cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*));
PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e;
cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(order, &cu_order, sizeof(int));
cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*));
PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e;
cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("order",&cu_order, sizeof(int));
cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*));
CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); CUT_CHECK_ERROR("ERROR-CUDA poisson_init");
  /*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n");
  #ifdef PPPM_PRECISION
  if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n");
  if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n");
  #endif
  #ifdef ENERGY_PRECISION
  if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n");
  if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n");
  #endif
  #ifdef ENERGY_PRECISION
  if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n");
  if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n");
  #endif
  #ifdef X_PRECISION
  if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n");
  if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n");
  #endif
  #ifdef F_PRECISION
  if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n");
  if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n");
  #endif*/
}
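// Setup-time constants that can change between runs: grid spacings, the charge
// assignment stencil extent (nlower/nupper), and the local sub-domain box.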
void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper)
{
  cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT));
  cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT));
  cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT));
  cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT));
  cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int));
  cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(sublo), sdata->domain.sublo, 3 * sizeof(X_FLOAT));
  cudaMemcpyToSymbol(MY_AP(subhi), sdata->domain.subhi, 3 * sizeof(X_FLOAT));
  cudaMemcpyToSymbol(MY_AP(boxlo), sdata->domain.boxlo, 3 * sizeof(X_FLOAT));
  CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup");
}
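// Refresh the per-atom device pointers and counts; these can change whenever
// LAMMPS reallocates its atom arrays or atoms migrate between processors.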
void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa)
{
  cudaMemcpyToSymbol("part2grid", &cu_part2grid, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(q), &sdata->atom.q.dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_AP(tag), &sdata->atom.tag.dev_data, sizeof(int*));
  //cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal.dev_data, sizeof(int));
  cudaMemcpyToSymbol(nlocal, &nlocala, sizeof(int));
  cudaMemcpyToSymbol(nmax, &nmaxa, sizeof(int));
  CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update");
}

void pppm_update_nlocal(int nlocala)
{
  cudaMemcpyToSymbol(nlocal, &nlocala, sizeof(int));
  CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b");
}
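// The k-space setup kernels below launch one block per (z,y) grid line with one
// thread per x grid point, so a block covers a full row of the PPPM mesh.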
void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald)
{
  dim3 grid;
  dim3 threads;
  grid.x = nz_pppma;
  grid.y = ny_pppma;
  grid.z = 1;
  threads.x = nx_pppma;
  threads.y = 1;
  threads.z = 1;
  setup_fkxyz_vg<<<grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg ");
}

void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
                              int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab)
{
  dim3 grid;
  dim3 threads;
  grid.x = nz_pppma;
  grid.y = ny_pppma;
  grid.z = 1;
  threads.x = nx_pppma;
  threads.y = 1;
  threads.z = 1;
  setup_greensfn<<<grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald, nbx, nby, nbz, xprd, yprd, zprd_slab);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn ");
}
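// Poisson-solve helpers that run over the full PPPM grid: scale the transformed
// charge density by the Green's function, then take the x/y/z gradients in
// k-space (the actual math lives in the corresponding *_kernel functions).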
void poisson_scale(int nx_pppma, int ny_pppma, int nz_pppma)
{
  dim3 grid;
  dim3 threads;
  grid.x = nz_pppma;
  grid.y = ny_pppma;
  grid.z = 1;
  threads.x = nx_pppma;
  threads.y = 1;
  threads.z = 1;
  poisson_scale_kernel<<<grid, threads, 0>>>();
  CUT_CHECK_ERROR("ERROR-CUDA poisson_scale ");
}

void poisson_xgrad(int nx_pppma, int ny_pppma, int nz_pppma)
{
  dim3 grid;
  dim3 threads;
  grid.x = nz_pppma;
  grid.y = ny_pppma;
  grid.z = 1;
  threads.x = nx_pppma;
  threads.y = 1;
  threads.z = 1;
  poisson_xgrad_kernel<<<grid, threads, 0>>>();
  CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad ");
}

void poisson_ygrad(int nx_pppma, int ny_pppma, int nz_pppma)
{
  dim3 grid;
  dim3 threads;
  grid.x = nz_pppma;
  grid.y = ny_pppma;
  grid.z = 1;
  threads.x = nx_pppma;
  threads.y = 1;
  threads.z = 1;
  poisson_ygrad_kernel<<<grid, threads, 0>>>();
  CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad ");
}

void poisson_zgrad(int nx_pppma, int ny_pppma, int nz_pppma)
{
  dim3 grid;
  dim3 threads;
  grid.x = nz_pppma;
  grid.y = ny_pppma;
  grid.z = 1;
  threads.x = nx_pppma;
  threads.y = 1;
  threads.z = 1;
  poisson_zgrad_kernel<<<grid, threads, 0>>>();
  CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad ");
}
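// The solved field components are copied into the vd{x,y,z}_brick arrays over
// the inner grid region (ilo..ihi, jlo..jhi, klo..khi); each launch spans
// exactly that region, one thread per x grid point.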
void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppma, int ny_pppma, int nz_pppma)
{
  dim3 grid;
  dim3 threads;
  grid.x = khi - klo + 1;
  grid.y = jhi - jlo + 1;
  grid.z = 1;
  threads.x = ihi - ilo + 1;
  threads.y = 1;
  threads.z = 1;
  //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x);
  poisson_vdx_brick_kernel<<<grid, threads, 0>>>(ilo, jlo, klo);
  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick ");
  cudaThreadSynchronize();
}

void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm)
{
  dim3 grid;
  dim3 threads;
  grid.x = khi - klo + 1;
  grid.y = jhi - jlo + 1;
  grid.z = 1;
  threads.x = ihi - ilo + 1;
  threads.y = 1;
  threads.z = 1;
  poisson_vdy_brick_kernel<<<grid, threads, 0>>>(ilo, jlo, klo);
  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick ");
  cudaThreadSynchronize();
}

void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm)
{
  dim3 grid;
  dim3 threads;
  grid.x = khi - klo + 1;
  grid.y = jhi - jlo + 1;
  grid.z = 1;
  threads.x = ihi - ilo + 1;
  threads.y = 1;
  threads.z = 1;
  poisson_vdz_brick_kernel<<<grid, threads, 0>>>(ilo, jlo, klo);
  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick ");
  cudaThreadSynchronize();
}

void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag)
{
  //printf("VFLAG_GPU: %i\n",vflag);
  CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start ");
  dim3 grid;
  dim3 threads;
  grid.x = nzhi_fft - nzlo_fft + 1;
  grid.y = nyhi_fft - nylo_fft + 1;
  grid.z = 1;
  threads.x = nxhi_fft - nxlo_fft + 1;
  threads.y = 1;
  threads.z = 1;
  poisson_energy_kernel<<<grid, threads, threads.x * sizeof(ENERGY_FLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end ");
}
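// Two-stage reduction of the energy/virial partial sums produced above:
// kernel1 reduces across y for every z plane, kernel2 reduces the per-plane
// results across z, and the final scalar (plus the 6 virial components when
// vflag is set) is copied back to the host.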
ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial)
{
  ENERGY_FLOAT host_energy = 0;
  dim3 grid;
  dim3 threads;
  grid.x = nz_pppma;
  grid.y = 1;
  grid.z = 1;
  threads.x = ny_pppma;
  threads.y = 1;
  threads.z = 1;
  sum_energy_kernel1<<<grid, threads, ny_pppma * sizeof(ENERGY_FLOAT)>>>(vflag);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 ");
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;
  threads.x = nz_pppma;
  threads.y = 1;
  threads.z = 1;
  sum_energy_kernel2<<<grid, threads, nz_pppma * sizeof(ENERGY_FLOAT)>>>(vflag);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 ");
  cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);

  if(vflag)
    cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);

  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy");
  return host_energy;
}
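// Charge assignment ("make_rho"). The density is accumulated into an integer
// brick scaled by density_intScale (apparently a fixed-point scheme so atomic
// adds stay exact). The retry loop reacts to the kernel's flags: flag[0] set
// means the scale was too large, so it is halved; if flags 0 and 1 are both
// clear the scale is doubled; it stops once flag[0] is clear and flag[1] is set.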
void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int)
{
  CUT_CHECK_ERROR("cuda_make_rho begin");
  dim3 grid, threads;
  int cpu_flag[3];
  grid.x = (sdata->atom.nlocal + 31) / 32;
  grid.y = 1;
  grid.z = 1;
  threads.x = 32;
  threads.y = 1;
  threads.z = 1;
  int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);

  do {
    cpu_flag[0] = 0;
    cpu_flag[1] = 0;
    cpu_flag[2] = 0;
    cudaMemcpyToSymbol("density_intScale", cu_density_intScale, sizeof(PPPM_FLOAT*));
    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z");
    cudaMemset(flag, 0, 3 * sizeof(int));
    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A");
    cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1) * sizeof(PPPM_FLOAT));
    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B");
    cudaMemset(cu_density_brick_int, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1) * sizeof(int));
    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C");
    make_rho_kernel<<<grid, threads, sharedmemsize>>>((int*) flag, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1));
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("ERROR-CUDA make_rho A");
    cudaMemcpy((void*) &cpu_flag, flag, 3 * sizeof(int), cudaMemcpyDeviceToHost);

    if(cpu_flag[0] != 0) {
      (*cu_density_intScale) /= 2;
      MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n", *cu_density_intScale);)
    }

    if((cpu_flag[0] == 0) && (cpu_flag[1] == 0)) {
      (*cu_density_intScale) *= 2;
      MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n", *cu_density_intScale);)
    }

    /* if((*cu_density_intScale)>0xe0000000)
      {
        printf("Error Scaling\n");
        cpu_flag[0]=0;
        cpu_flag[1]=1;
      }*/
    CUT_CHECK_ERROR("ERROR-CUDA make_rho B");
  } while((cpu_flag[0] != 0) || (cpu_flag[1] == 0));

  grid.x = khi - klo + 1;
  grid.y = jhi - jlo + 1;
  threads.x = ihi - ilo + 1;
  scale_rho_kernel<<<grid, threads, 0>>>();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale");
}
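// Map each local atom to its PPPM grid cell (one warp-sized block per 32 atoms);
// the returned flag is the kernel's error indicator, presumably non-zero when an
// atom falls outside the local grid.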
int cuda_particle_map(cuda_shared_data* sdata, void* flag)
{
  dim3 grid, threads;
  int cpu_flag;
  grid.x = (sdata->atom.nlocal + 31) / 32;
  grid.y = 1;
  grid.z = 1;
  threads.x = 32;
  threads.y = 1;
  threads.z = 1;
  CUT_CHECK_ERROR("ERROR-CUDA particle_map ..pre");
  particle_map_kernel<<<grid, threads, 0>>>((int*) flag);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA particle_map a");
  cudaMemcpy((void*) &cpu_flag, flag, sizeof(int), cudaMemcpyDeviceToHost);
  CUT_CHECK_ERROR("ERROR-CUDA particle_map b");
  return cpu_flag;
}
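// Interpolate the grid field back onto the atoms ("fieldforce"); shared memory
// holds the per-atom stencil contributions for all three force components plus
// the rho coefficients, mirroring the sizing used in cuda_make_rho.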
void cuda_fieldforce(cuda_shared_data* sdata, void* flag)
{
  dim3 grid, threads;
  grid.x = (sdata->atom.nlocal + 31) / 32;
  grid.y = 1;
  grid.z = 1;
  threads.x = 32;
  threads.y = 1;
  threads.z = 1;
  int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
  fieldforce_kernel<<<grid, threads, sharedmemsize>>>
  (sdata->pppm.nupper - sdata->pppm.nlower + 1, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1), (int*) flag);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA fieldforce");
}
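// Slab correction: the kernel writes one partial dipole sum per block (32 atoms
// each); the partials are copied back and reduced on the host, and the matching
// force correction is applied by cuda_slabcorr_force.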
double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf)
{
  dim3 grid, threads;
  grid.x = (sdata->atom.nlocal + 31) / 32;
  grid.y = 1;
  grid.z = 1;
  threads.x = 32;
  threads.y = 1;
  threads.z = 1;
  slabcorr_energy_kernel<<<grid, threads, 32 * sizeof(ENERGY_FLOAT)>>>(dev_buf);
  cudaThreadSynchronize();
  cudaMemcpy((void*) buf, dev_buf, grid.x * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
  double dipole_all = 0.0;

  for(int i = 0; i < grid.x; i++)
    dipole_all += buf[i];

  return dipole_all;
}

void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact)
{
  dim3 grid, threads;
  grid.x = (sdata->atom.nlocal + 31) / 32;
  grid.y = 1;
  grid.z = 1;
  threads.x = 32;
  threads.y = 1;
  threads.z = 1;
  slabcorr_force_kernel<<<grid, threads>>>(ffact);
  cudaThreadSynchronize();
}
@@ -519,59 +528,59 @@ void sum_virial(double* host_virial)
{
}
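// Repack the density brick, including its ghost rims, into the contiguous FFT
// work array: one kernel covers the interior, and one kernel per face/edge/corner
// rim combination (z, y, x, yz, xz, xy, xyz) handles the halo regions.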
void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out)
{
  int nslow = sdata->pppm.nzhi_in - sdata->pppm.nzlo_in;
  int nmid = sdata->pppm.nyhi_in - sdata->pppm.nylo_in;
  int nfast = sdata->pppm.nxhi_in - sdata->pppm.nxlo_in;
  int nrimz = MAX(sdata->pppm.nzlo_in - sdata->pppm.nzlo_out, sdata->pppm.nzhi_out - sdata->pppm.nzhi_in);
  int nrimy = MAX(sdata->pppm.nylo_in - sdata->pppm.nylo_out, sdata->pppm.nyhi_out - sdata->pppm.nyhi_in);
  int nrimx = MAX(sdata->pppm.nxlo_in - sdata->pppm.nxlo_out, sdata->pppm.nxhi_out - sdata->pppm.nxhi_in);
  dim3 grid;
  grid.x = nslow + 1;
  grid.y = nmid + 1;
  grid.z = 1;
  dim3 threads;
  threads.x = nfast + 1;
  threads.y = 1;
  threads.z = 1;
  cudaThreadSynchronize();
  initfftdata_core_kernel<<<grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  grid.x = nrimz;
  grid.y = nmid + 1;
  threads.x = nfast + 1;
  initfftdata_z_kernel<<<grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  grid.x = nslow + 1;
  grid.y = nrimy;
  threads.x = nfast + 1;
  initfftdata_y_kernel<<<grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  grid.x = nslow + 1;
  grid.y = nmid + 1;
  threads.x = nrimx;
  initfftdata_x_kernel<<<grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  grid.x = nrimz;
  grid.y = nrimy;
  threads.x = nfast + 1;
  initfftdata_yz_kernel<<<grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  grid.x = nrimz;
  grid.y = nmid + 1;
  threads.x = nrimx;
  initfftdata_xz_kernel<<<grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  grid.x = nslow + 1;
  grid.y = nrimy;
  threads.x = nrimx;
  initfftdata_xy_kernel<<<grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  grid.x = nrimz;
  grid.y = nrimy;
  threads.x = nrimx;
  initfftdata_xyz_kernel<<<grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel");
}