git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8904 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2012-10-04 14:58:13 +00:00
parent 1f9963cb32
commit 4895715bdd
7 changed files with 2079 additions and 1992 deletions


@@ -14,7 +14,8 @@ SHELL = /bin/sh
# System-specific settings
CUDA_INSTALL_PATH = /usr/local/cuda
#CUDA_INSTALL_PATH = /usr/local/cuda
CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
# e.g. in Gentoo
# CUDA_INSTALL_PATH = /opt/cuda
@@ -97,8 +98,22 @@ else
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_21
else
CUDA_FLAGS += -DCUDA_ARCH=99
SMVERSIONFLAGS := -arch sm_13
ifeq ($(strip $(arch)), 30)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_30
else
ifeq ($(strip $(arch)), 35)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_35
else
CUDA_FLAGS += -DCUDA_ARCH=99
SMVERSIONFLAGS := -arch sm_13
endif
endif
endif
endif
endif

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,22 +1,22 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
@@ -54,17 +54,17 @@ __device__ __constant__ F_FLOAT* MY_AP(fp);
#define _rhor_spline_tex MY_AP(rhor_spline_tex)
#if F_PRECISION == 1
texture<float4,1> _rhor_spline_tex;
texture<float4, 1> _rhor_spline_tex;
#else
texture<int4,1> _rhor_spline_tex;
texture<int4, 1> _rhor_spline_tex;
#endif
#define _z2r_spline_tex MY_AP(z2r_spline_tex)
#if F_PRECISION == 1
texture<float4,1> _z2r_spline_tex;
texture<float4, 1> _z2r_spline_tex;
#else
texture<int4,1> _z2r_spline_tex;
texture<int4, 1> _z2r_spline_tex;
#endif
@@ -85,243 +85,258 @@ inline void BindEAMTextures(cuda_shared_data* sdata)
_rhor_spline_tex.normalized = false; // do not use normalized texture coordinates
_rhor_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
_rhor_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* rhor_spline_texture_ptr;
cudaGetTextureReference(&rhor_spline_texture_ptr, MY_CONST(rhor_spline_tex));
#if F_PRECISION == 1
const textureReference* rhor_spline_texture_ptr = &MY_AP(rhor_spline_tex);
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<float4>();
cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
#else
cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
#else
cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<int4>();
cudaBindTexture(0,rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
#endif
cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
#endif
_z2r_spline_tex.normalized = false; // do not use normalized texture coordinates
_z2r_spline_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
_z2r_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* z2r_spline_texture_ptr;
cudaGetTextureReference(&z2r_spline_texture_ptr, MY_CONST(z2r_spline_tex));
const textureReference* z2r_spline_texture_ptr = &MY_AP(z2r_spline_tex);
#if F_PRECISION == 1
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<float4>();
cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
#else
cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
#else
cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<int4>();
cudaBindTexture(0,z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
#endif
cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
#endif
}
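// ------------------------------------------------------------------------
// Illustrative sketch (not part of this commit): the pattern the hunk above
// moves to -- taking the address of a texture reference directly instead of
// looking it up by its string name via cudaGetTextureReference(). Uses the
// legacy texture-reference API this code targets; names are hypothetical.
texture<float4, 1> my_spline_tex;                      // module-scope texture reference
inline void BindSplineTexture(const void* dev_ptr, size_t bytes)
{
  my_spline_tex.normalized = false;                    // raw (unnormalized) coordinates
  my_spline_tex.filterMode = cudaFilterModePoint;      // no interpolation
  my_spline_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* ref = &my_spline_tex;        // address-of, no string lookup
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, ref, dev_ptr, &desc, bytes);      // bind device memory to the texture
}
// ------------------------------------------------------------------------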
void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
int3 layout=getgrid(sneighlist->inum,7*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.y*layout.x)*7*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
if(sdata->buffer!=NULL) cudaFree(sdata->buffer);
cudaMalloc((void**)&sdata->buffer,size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
if(sdata->buffer != NULL) cudaFree(sdata->buffer);
cudaMalloc((void**)&sdata->buffer, size);
sdata->buffersize = size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
}
void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
}
void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
}
void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata,double rdr,double rdrho,int nfrho, int nrhor,int nr, int nrho,int nz2r,
void* frho_spline,void* rhor_spline,void* z2r_spline,void* rho,void* fp,
int* type2frho,int** type2z2r,int** type2rhor)
void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r,
void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp,
int* type2frho, int** type2z2r, int** type2rhor)
{
// !! LAMMPS indexes atom types starting with 1 !!
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2)
printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
// !! LAMMPS indexes atom types starting with 1 !!
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
X_FLOAT cutsq_global;
cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];
for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];
cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT));
for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];
cudaMemcpyToSymbol(MY_AP(coeff2) , coeff_buf , nI);
for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2z2r[0][0])[i];
cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI);
delete [] coeff_buf;
X_FLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};
F_FLOAT rdr_F = rdr;
F_FLOAT rdrho_F = rdrho;
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
rhor_spline_pointer = rhor_spline;
z2r_spline_pointer = z2r_spline;
CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
X_FLOAT cutsq_global;
cutsq_global = (X_FLOAT) (sdata->pair.cut_global);
cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) );
F_FLOAT* coeff_buf=new F_FLOAT[cuda_ntypes*cuda_ntypes];
for(int i=0;i<cuda_ntypes;i++) coeff_buf[i]=type2frho[i];
cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes*sizeof(F_FLOAT) );
for(int i=0;i<cuda_ntypes*cuda_ntypes;i++) coeff_buf[i]=(&type2rhor[0][0])[i];
cudaMemcpyToSymbol(MY_AP(coeff2) , coeff_buf , nI );
for(int i=0;i<cuda_ntypes*cuda_ntypes;i++) coeff_buf[i]=(&type2z2r[0][0])[i];
cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI );
delete [] coeff_buf;
X_FLOAT box_size[3] =
{
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};
F_FLOAT rdr_F=rdr;
F_FLOAT rdrho_F=rdrho;
cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rdr), &rdr_F, sizeof(F_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(rdrho), &rdrho_F, sizeof(F_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(nr), &nr, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nrho), &nrho, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nfrho), &nfrho, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rho), &rho, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(fp), &fp, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(frho_spline), &frho_spline, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rhor_spline), &rhor_spline, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(z2r_spline), &z2r_spline, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) );
rhor_spline_size = nrhor*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT);
z2r_spline_size = nz2r*(nr+1)*EAM_COEFF_LENGTH*sizeof(F_FLOAT);
rhor_spline_pointer = rhor_spline;
z2r_spline_pointer = z2r_spline;
CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
}
void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
if(sdata->atom.update_nmax)
Cuda_PairEAMCuda_UpdateNmax(sdata,sneighlist);
if(sdata->atom.update_nmax)
Cuda_PairEAMCuda_UpdateNmax(sdata, sneighlist);
if(sdata->atom.update_neigh)
Cuda_PairEAMCuda_UpdateNeighbor(sdata,sneighlist);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_PairEAMCuda_UpdateBuffer(sdata,sneighlist);
cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );
Cuda_PairEAMCuda_UpdateNeighbor(sdata, sneighlist);
int sharedperproc=0;
if(eflag||eflag_atom) sharedperproc=1;
if(vflag||vflag_atom) sharedperproc=7;
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
eam_buff_offset=grid.x*grid.y;
BindXTypeTexture(sdata);
BindEAMTextures( sdata);// initialize only on first call
if(sdata->buffer_new)
Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); )
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
PairEAMCuda_Kernel1<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag,eflag_atom,vflag_atom);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
int sharedperproc = 0;
if(eflag || eflag_atom) sharedperproc = 1;
if(vflag || vflag_atom) sharedperproc = 7;
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
eam_buff_offset = grid.x * grid.y;
BindXTypeTexture(sdata);
BindEAMTextures(sdata); // initialize only on first call
MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); )
MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
}
void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
int sharedperproc=0;
if(eflag||eflag_atom) sharedperproc=1;
if(vflag||vflag_atom) sharedperproc=7;
int3 layout=getgrid(sneighlist->inum,sharedperproc*sizeof(ENERGY_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
BindXTypeTexture(sdata);
BindEAMTextures( sdata);// initialize only on first call
// initialize only on first call
sdata->pair.lastgridsize=grid.x*grid.y;
sdata->pair.n_energy_virial=sharedperproc;
int sharedperproc = 0;
MYDBG( printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n",eflag,vflag); )
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
PairEAMCuda_Kernel2<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag,eflag_atom,vflag_atom);
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
if(eflag||vflag)
{
int n=grid.x*grid.y;
grid.x=sharedperproc;
grid.y=1;
threads.x=256;
MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
}
if(eflag || eflag_atom) sharedperproc = 1;
if(vflag || vflag_atom) sharedperproc = 7;
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
BindXTypeTexture(sdata);
BindEAMTextures(sdata); // initialize only on first call
// initialize only on first call
sdata->pair.lastgridsize = grid.x * grid.y;
sdata->pair.n_energy_virial = sharedperproc;
MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
if(eflag || vflag) {
int n = grid.x * grid.y;
grid.x = sharedperproc;
grid.y = 1;
threads.x = 256;
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
}
MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
MYDBG( printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n"); )
}
void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send)
void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send)
{
int3 layout=getgrid(n,0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
F_FLOAT* buf=(F_FLOAT*) (& ((double*)sdata->buffer)[eam_buff_offset]);
PairEAMCuda_PackComm_Kernel<<<grid, threads,0>>> ((int*) sdata->comm.sendlist.dev_data,n
,sdata->comm.maxlistlength,iswap,buf);
cudaThreadSynchronize();
cudaMemcpy(buf_send, buf, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
int3 layout = getgrid(n, 0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n
, sdata->comm.maxlistlength, iswap, buf);
cudaThreadSynchronize();
cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
}
void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,void* fp)
void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp)
{
F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
cudaMemcpy(fp_first,buf_recv, n*sizeof(F_FLOAT), cudaMemcpyHostToDevice);
F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice);
}
#undef _type2frho
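
Most of the edits in this file replace the string-named form of cudaMemcpyToSymbol (reached through the MY_CONST macro) with the MY_AP form that passes the __constant__ symbol itself; recent CUDA toolkits drop the string-named overload. A minimal sketch of the symbol-based calls, with hypothetical names:

#include <cuda_runtime.h>
__device__ __constant__ int d_nlocal;          // hypothetical __constant__ scalar
__device__ __constant__ float* d_buffer;       // hypothetical __constant__ device pointer
void update_device_constants(int h_nlocal, float* dev_buffer)
{
    // the symbol is passed directly, not as the string "d_nlocal"
    cudaMemcpyToSymbol(d_nlocal, &h_nlocal, sizeof(int));
    // for pointers, the pointer value itself is copied into constant memory
    cudaMemcpyToSymbol(d_buffer, &dev_buffer, sizeof(float*));
}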


@@ -1,22 +1,22 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
@@ -24,116 +24,115 @@
#include <stdio.h>
#include "pair_sw_cuda_cu.h"
__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR];
__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR];
#include "pair_sw_cuda_kernel_nc.cu"
#include <time.h>
void Cuda_PairSWCuda_Init(cuda_shared_data* sdata,ParamSW_Float* params_host,void* map_host, void* elem2param_host,int nelements_h)
void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h)
{
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
X_FLOAT box_size[3] =
{
X_FLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};
cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
cudaMemcpyToSymbol("params_sw", params_host , sizeof(ParamSW_Float)*nelements_h*nelements_h*nelements_h );
cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h );
cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes );
cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int));
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
cudaMemcpyToSymbol(params_sw, params_host , sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h);
cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h);
cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes);
cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
}
void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
static int glob_ij_size=0;
static F_FLOAT4* glob_r_ij=NULL;
static int* glob_numneigh_red=NULL;
static int* glob_neighbors_red=NULL;
static int* glob_neightype_red=NULL;
static int glob_ij_size = 0;
static F_FLOAT4* glob_r_ij = NULL;
static int* glob_numneigh_red = NULL;
static int* glob_neighbors_red = NULL;
static int* glob_neightype_red = NULL;
if(glob_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT))
{
glob_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT);
if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
cudaFree(glob_r_ij);
cudaFree(glob_numneigh_red);
cudaFree(glob_neighbors_red);
cudaFree(glob_neightype_red);
cudaMalloc(&glob_r_ij,glob_ij_size*4);
cudaMalloc(&glob_numneigh_red,sdata->atom.nall*sizeof(int));
cudaMalloc(&glob_neighbors_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
cudaMalloc(&glob_neightype_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) );
}
dim3 grid,threads;
int sharedperproc;
cudaMalloc(&glob_r_ij, glob_ij_size * 4);
cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*));
}
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,64);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
dim3 grid, threads;
int sharedperproc;
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
dim3 grid2;
if(sdata->atom.nall<=256*64000){
grid2.x = (sdata->atom.nall+255)/256;
grid2.y = 1;
} else {
grid2.x = (sdata->atom.nall+256*128-1)/(256*128);
dim3 grid2;
if(sdata->atom.nall <= 256 * 64000) {
grid2.x = (sdata->atom.nall + 255) / 256;
grid2.y = 1;
} else {
grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
grid2.y = 128;
}
grid2.z = 1;
grid2.z = 1;
dim3 threads2;
threads2.x = 256;
threads2.y = 1;
threads2.z = 1;
timespec time1,time2;
timespec time1, time2;
//pre-calculate all neighbor distances and zeta_ij
clock_gettime(CLOCK_REALTIME,&time1);
Pair_SW_Kernel_TpA_RIJ<<<grid2, threads2,0,streams[1]>>>();
clock_gettime(CLOCK_REALTIME, &time1);
Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>();
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.test1+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
clock_gettime(CLOCK_REALTIME,&time1);
clock_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.test1 +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
clock_gettime(CLOCK_REALTIME, &time1);
//actual force calculation
unsigned int sharedsize=(sharedperproc*sizeof(ENERGY_FLOAT)+4*sizeof(F_FLOAT))*threads.x; //extra 4 floats per thread used to reduce register pressure
if(eflag)
{
unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
if(eflag) {
if(vflag)
Pair_SW_Kernel_TpA<1,1><<<grid, threads,sharedsize,streams[1]>>>
(eflag_atom,vflag_atom);
Pair_SW_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
else
Pair_SW_Kernel_TpA<1,0><<<grid, threads,sharedsize,streams[1]>>>
(eflag_atom,vflag_atom);
}
else
{
Pair_SW_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
} else {
if(vflag)
Pair_SW_Kernel_TpA<0,1><<<grid, threads,sharedsize,streams[1]>>>
(eflag_atom,vflag_atom);
Pair_SW_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
else
Pair_SW_Kernel_TpA<0,0><<<grid, threads,sharedsize,streams[1]>>>
(eflag_atom,vflag_atom);
Pair_SW_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
}
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.test2+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
clock_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.test2 +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
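
The grid2 computation above sizes a grid of 256-thread blocks for one thread per atom, spilling into grid.y once nall would need more than 64000 blocks in x. The same arithmetic, isolated into a small helper (hypothetical name, assuming nvcc / cuda_runtime.h for dim3):

#include <cuda_runtime.h>
// Sketch, not part of this commit: mirrors the grid2 sizing logic above.
dim3 make_atom_grid(int n)
{
    dim3 grid(1, 1, 1);
    if(n <= 256 * 64000) {
        grid.x = (n + 255) / 256;                   // ceil(n / 256) blocks in x
        grid.y = 1;
    } else {
        grid.x = (n + 256 * 128 - 1) / (256 * 128); // ceil(n / (256*128)) blocks in x
        grid.y = 128;                               // spread the remaining blocks over y
    }
    grid.z = 1;
    return grid;
}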


@@ -1,22 +1,22 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
@@ -25,7 +25,7 @@
#include "pair_tersoff_cuda_cu.h"
__device__ __constant__ Param_Float params[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR];
__device__ __constant__ Param_Float params[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR];
__device__ __constant__ F_FLOAT* _glob_zeta_ij; //zeta_ij
__device__ __constant__ F_FLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff
__device__ __constant__ bool _zbl; //is tersoff zbl?
@@ -36,119 +36,118 @@ __device__ __constant__ bool _zbl; //is tersoff zbl?
#include <time.h>
void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata,Param_Float* params_host,void* map_host, void* elem2param_host,int nelements_h,bool zbl)
void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl)
{
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
X_FLOAT box_size[3] =
{
X_FLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};
cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
cudaMemcpyToSymbol("params", params_host , sizeof(Param_Float)*nelements_h*nelements_h*nelements_h );
cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h );
cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes );
cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int));
cudaMemcpyToSymbol("_zbl",&zbl,sizeof(bool));
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
cudaMemcpyToSymbol(params, params_host , sizeof(Param_Float)*nelements_h * nelements_h * nelements_h);
cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h);
cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes);
cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
cudaMemcpyToSymbol(_zbl, &zbl, sizeof(bool));
}
void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
static F_FLOAT* glob_zeta_ij=NULL;
static int glob_zeta_ij_size=0;
static F_FLOAT4* glob_r_ij=NULL;
static int* glob_numneigh_red=NULL;
static int* glob_neighbors_red=NULL;
static int* glob_neightype_red=NULL;
static F_FLOAT* glob_zeta_ij = NULL;
static int glob_zeta_ij_size = 0;
static F_FLOAT4* glob_r_ij = NULL;
static int* glob_numneigh_red = NULL;
static int* glob_neighbors_red = NULL;
static int* glob_neightype_red = NULL;
if(glob_zeta_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT))
{
glob_zeta_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT);
cudaFree(glob_zeta_ij);
if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
cudaFree(glob_zeta_ij);
cudaFree(glob_r_ij);
cudaFree(glob_numneigh_red);
cudaFree(glob_neighbors_red);
cudaFree(glob_neightype_red);
cudaMalloc(&glob_zeta_ij,glob_zeta_ij_size);
cudaMalloc(&glob_r_ij,glob_zeta_ij_size*4);
cudaMalloc(&glob_numneigh_red,sdata->atom.nall*sizeof(int));
cudaMalloc(&glob_neighbors_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
cudaMalloc(&glob_neightype_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) );
cudaMemcpyToSymbol("_glob_zeta_ij", &glob_zeta_ij , sizeof(F_FLOAT*) );
}
dim3 grid,threads;
int sharedperproc;
cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size);
cudaMalloc(&glob_r_ij, glob_zeta_ij_size * 4);
cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*));
cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij , sizeof(F_FLOAT*));
}
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,64);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
dim3 grid, threads;
int sharedperproc;
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
dim3 grid2;
if(sdata->atom.nall<=256*64000){
grid2.x = (sdata->atom.nall+255)/256;
grid2.y = 1;
} else {
grid2.x = (sdata->atom.nall+256*128-1)/(256*128);
dim3 grid2;
if(sdata->atom.nall <= 256 * 64000) {
grid2.x = (sdata->atom.nall + 255) / 256;
grid2.y = 1;
} else {
grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
grid2.y = 128;
}
grid2.z = 1;
grid2.z = 1;
dim3 threads2;
threads2.x = 256;
threads2.y = 1;
threads2.z = 1;
timespec time1,time2;
timespec time1, time2;
//pre-calculate all neighbor distances and zeta_ij
clock_gettime(CLOCK_REALTIME,&time1);
Pair_Tersoff_Kernel_TpA_RIJ<<<grid2, threads2,0,streams[1]>>>
();
clock_gettime(CLOCK_REALTIME, &time1);
Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>
();
cudaThreadSynchronize();
Pair_Tersoff_Kernel_TpA_ZetaIJ<<<grid2, threads2,0,streams[1]>>>
();
Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>>
();
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.test1+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
clock_gettime(CLOCK_REALTIME,&time1);
clock_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.test1 +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
clock_gettime(CLOCK_REALTIME, &time1);
//actual force calculation
unsigned int sharedsize=(sharedperproc*sizeof(ENERGY_FLOAT)+4*sizeof(F_FLOAT))*threads.x; //extra 4 floats per thread used to reduce register pressure
if(eflag)
{
unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
if(eflag) {
if(vflag)
Pair_Tersoff_Kernel_TpA<1,1><<<grid, threads,sharedsize,streams[1]>>>
(eflag_atom,vflag_atom);
Pair_Tersoff_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
else
Pair_Tersoff_Kernel_TpA<1,0><<<grid, threads,sharedsize,streams[1]>>>
(eflag_atom,vflag_atom);
}
else
{
Pair_Tersoff_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
} else {
if(vflag)
Pair_Tersoff_Kernel_TpA<0,1><<<grid, threads,sharedsize,streams[1]>>>
(eflag_atom,vflag_atom);
Pair_Tersoff_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
else
Pair_Tersoff_Kernel_TpA<0,0><<<grid, threads,sharedsize,streams[1]>>>
(eflag_atom,vflag_atom);
Pair_Tersoff_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
}
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.test2+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
clock_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.test2 +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
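
Both many-body drivers bracket their kernel launches with clock_gettime(CLOCK_REALTIME, ...) and accumulate the elapsed time in seconds into sdata->cuda_timings. A self-contained sketch of that measurement, assuming <time.h> and the (legacy) cudaThreadSynchronize() used throughout this file:

#include <time.h>
#include <cuda_runtime.h>
// Sketch, not part of this commit: elapsed wall-clock seconds around a device step.
double elapsed_device_seconds(void (*launch)())
{
    timespec t1, t2;
    clock_gettime(CLOCK_REALTIME, &t1);
    launch();                                  // e.g. one or more kernel launches
    cudaThreadSynchronize();                   // wait for the device to finish
    clock_gettime(CLOCK_REALTIME, &t2);
    return (t2.tv_sec - t1.tv_sec) + 1.0 * (t2.tv_nsec - t1.tv_nsec) / 1000000000;
}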


@@ -1,22 +1,22 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
@@ -34,484 +34,493 @@
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
__device__ __constant__ FFT_FLOAT* work1;
__device__ __constant__ FFT_FLOAT* work2;
__device__ __constant__ FFT_FLOAT* work3;
__device__ __constant__ PPPM_FLOAT* greensfn;
__device__ __constant__ PPPM_FLOAT* gf_b;
__device__ __constant__ PPPM_FLOAT* fkx;
__device__ __constant__ PPPM_FLOAT* fky;
__device__ __constant__ PPPM_FLOAT* fkz;
__device__ __constant__ PPPM_FLOAT* vg;
__device__ __constant__ int* part2grid;
__device__ __constant__ PPPM_FLOAT* density_brick;
__device__ __constant__ int* density_brick_int;
__device__ __constant__ PPPM_FLOAT density_intScale;
__device__ __constant__ PPPM_FLOAT* vdx_brick;
__device__ __constant__ PPPM_FLOAT* vdy_brick;
__device__ __constant__ PPPM_FLOAT* vdz_brick;
__device__ __constant__ PPPM_FLOAT* density_fft;
__device__ __constant__ ENERGY_FLOAT* energy;
__device__ __constant__ ENERGY_FLOAT* virial;
__device__ __constant__ int nxlo_in;
__device__ __constant__ int nxhi_in;
__device__ __constant__ int nxlo_out;
__device__ __constant__ int nxhi_out;
__device__ __constant__ int nylo_in;
__device__ __constant__ int nyhi_in;
__device__ __constant__ int nylo_out;
__device__ __constant__ int nyhi_out;
__device__ __constant__ int nzlo_in;
__device__ __constant__ int nzhi_in;
__device__ __constant__ int nzlo_out;
__device__ __constant__ int nzhi_out;
__device__ __constant__ int nxlo_fft;
__device__ __constant__ int nxhi_fft;
__device__ __constant__ int nylo_fft;
__device__ __constant__ int nyhi_fft;
__device__ __constant__ int nzlo_fft;
__device__ __constant__ int nzhi_fft;
__device__ __constant__ int nx_pppm;
__device__ __constant__ int ny_pppm;
__device__ __constant__ int nz_pppm;
__device__ __constant__ int slabflag;
__device__ __constant__ PPPM_FLOAT qqrd2e;
__device__ __constant__ int order;
//__device__ __constant__ float3 sublo;
__device__ __constant__ PPPM_FLOAT* rho_coeff;
__device__ __constant__ int nmax;
__device__ __constant__ int nlocal;
__device__ __constant__ PPPM_FLOAT* debugdata;
__device__ __constant__ PPPM_FLOAT delxinv;
__device__ __constant__ PPPM_FLOAT delyinv;
__device__ __constant__ PPPM_FLOAT delzinv;
__device__ __constant__ int nlower;
__device__ __constant__ int nupper;
__device__ __constant__ PPPM_FLOAT shiftone;
__device__ __constant__ FFT_FLOAT* work1;
__device__ __constant__ FFT_FLOAT* work2;
__device__ __constant__ FFT_FLOAT* work3;
__device__ __constant__ PPPM_FLOAT* greensfn;
__device__ __constant__ PPPM_FLOAT* gf_b;
__device__ __constant__ PPPM_FLOAT* fkx;
__device__ __constant__ PPPM_FLOAT* fky;
__device__ __constant__ PPPM_FLOAT* fkz;
__device__ __constant__ PPPM_FLOAT* vg;
__device__ __constant__ int* part2grid;
__device__ __constant__ PPPM_FLOAT* density_brick;
__device__ __constant__ int* density_brick_int;
__device__ __constant__ PPPM_FLOAT density_intScale;
__device__ __constant__ PPPM_FLOAT* vdx_brick;
__device__ __constant__ PPPM_FLOAT* vdy_brick;
__device__ __constant__ PPPM_FLOAT* vdz_brick;
__device__ __constant__ PPPM_FLOAT* density_fft;
__device__ __constant__ ENERGY_FLOAT* energy;
__device__ __constant__ ENERGY_FLOAT* virial;
__device__ __constant__ int nxlo_in;
__device__ __constant__ int nxhi_in;
__device__ __constant__ int nxlo_out;
__device__ __constant__ int nxhi_out;
__device__ __constant__ int nylo_in;
__device__ __constant__ int nyhi_in;
__device__ __constant__ int nylo_out;
__device__ __constant__ int nyhi_out;
__device__ __constant__ int nzlo_in;
__device__ __constant__ int nzhi_in;
__device__ __constant__ int nzlo_out;
__device__ __constant__ int nzhi_out;
__device__ __constant__ int nxlo_fft;
__device__ __constant__ int nxhi_fft;
__device__ __constant__ int nylo_fft;
__device__ __constant__ int nyhi_fft;
__device__ __constant__ int nzlo_fft;
__device__ __constant__ int nzhi_fft;
__device__ __constant__ int nx_pppm;
__device__ __constant__ int ny_pppm;
__device__ __constant__ int nz_pppm;
__device__ __constant__ int slabflag;
__device__ __constant__ PPPM_FLOAT qqrd2e;
__device__ __constant__ int order;
//__device__ __constant__ float3 sublo;
__device__ __constant__ PPPM_FLOAT* rho_coeff;
__device__ __constant__ int nmax;
__device__ __constant__ int nlocal;
__device__ __constant__ PPPM_FLOAT* debugdata;
__device__ __constant__ PPPM_FLOAT delxinv;
__device__ __constant__ PPPM_FLOAT delyinv;
__device__ __constant__ PPPM_FLOAT delzinv;
__device__ __constant__ int nlower;
__device__ __constant__ int nupper;
__device__ __constant__ PPPM_FLOAT shiftone;
#include "pppm_cuda_kernel.cu"
#include "stdio.h"
void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial
,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg
,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm
,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b
,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag
)
, void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg
, int cu_nxlo_in, int cu_nxhi_in, int cu_nylo_in, int cu_nyhi_in, int cu_nzlo_in, int cu_nzhi_in, int cu_nxlo_out, int cu_nxhi_out, int cu_nylo_out, int cu_nyhi_out, int cu_nzlo_out, int cu_nzhi_out, int cu_nx_pppm, int cu_ny_pppm, int cu_nz_pppm
, int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b
, double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_int, int cu_slabflag
)
{
CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start");
cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int));
cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int));
cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int));
cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, sizeof(int));
cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int));
cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int));
cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int));
cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int));
cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int));
cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int));
cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int));
cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int));
cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int));
cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int));
cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int));
cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int));
cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int));
cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int));
cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int));
cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int));
cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int));
cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int));
cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int));
cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int));
cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int));
cudaMemcpyToSymbol(nxhi_out, &cu_nxhi_out, sizeof(int));
cudaMemcpyToSymbol(nylo_in, &cu_nylo_in, sizeof(int));
cudaMemcpyToSymbol(nyhi_in, &cu_nyhi_in, sizeof(int));
cudaMemcpyToSymbol(nylo_out, &cu_nylo_out, sizeof(int));
cudaMemcpyToSymbol(nyhi_out, &cu_nyhi_out, sizeof(int));
cudaMemcpyToSymbol(nzlo_in, &cu_nzlo_in, sizeof(int));
cudaMemcpyToSymbol(nzhi_in, &cu_nzhi_in, sizeof(int));
cudaMemcpyToSymbol(nzlo_out, &cu_nzlo_out, sizeof(int));
cudaMemcpyToSymbol(nzhi_out, &cu_nzhi_out, sizeof(int));
cudaMemcpyToSymbol(nxlo_fft, &cu_nxlo_fft, sizeof(int));
cudaMemcpyToSymbol(nxhi_fft, &cu_nxhi_fft, sizeof(int));
cudaMemcpyToSymbol(nylo_fft, &cu_nylo_fft, sizeof(int));
cudaMemcpyToSymbol(nyhi_fft, &cu_nyhi_fft, sizeof(int));
cudaMemcpyToSymbol(nzlo_fft, &cu_nzlo_fft, sizeof(int));
cudaMemcpyToSymbol(nzhi_fft, &cu_nzhi_fft, sizeof(int));
cudaMemcpyToSymbol(slabflag, &cu_slabflag, sizeof(int));
cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int));
cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int));
cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int));
cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(gf_b, &cu_gf_b, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*));
PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e;
cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(order, &cu_order, sizeof(int));
cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*));
PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e;
cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("order",&cu_order, sizeof(int));
cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*));
CUT_CHECK_ERROR("ERROR-CUDA poisson_init");
/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n");
/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n");
#ifdef PPPM_PRECISION
if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n");
if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n");
#endif
#ifdef ENERGY_PRECISION
if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n");
if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n");
#endif
#ifdef ENERGY_PRECISION
if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n");
if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n");
#endif
#ifdef X_PRECISION
if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n");
if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n");
#endif
#ifdef F_PRECISION
if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n");
if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n");
#endif*/
}
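// Upload the PPPM grid geometry to device constant memory: inverse grid
// spacings, the half-cell stencil shift, the stencil bounds, and the
// subdomain/box corners.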
void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper)
{
cudaMemcpyToSymbol("delxinv",&cu_delxinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("delyinv",&cu_delyinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("delzinv",&cu_delzinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("shiftone",&cu_shiftone, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("nlower",&cu_nlower, sizeof(int));
cudaMemcpyToSymbol("nupper",&cu_nupper, sizeof(int));
cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int));
cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int));
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo, 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi, 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo, 3 * sizeof(X_FLOAT));
CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup");
}
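// Refresh the per-run device pointers (positions, forces, charges, tags,
// particle-to-grid map) and the current nlocal/nmax counts.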
void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa)
{
cudaMemcpyToSymbol("part2grid",&cu_part2grid, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
//cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int));
cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int));
cudaMemcpyToSymbol("nmax" , &nmaxa, sizeof(int));
cudaMemcpyToSymbol("part2grid", &cu_part2grid, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
//cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int));
cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int));
cudaMemcpyToSymbol(nmax , &nmaxa, sizeof(int));
CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update");
}
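// Update only the local atom count on the device (called when nlocal changes).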
void pppm_update_nlocal(int nlocala)
{
cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int));
CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b");
cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int));
CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b");
}
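// One block per (y,z) column of the PPPM mesh, one thread per x grid point;
// the kernel fills the fkx/fky/fkz wave-vector tables and the vg virial
// coefficients.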
void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald)
{
dim3 grid;
dim3 threads;
grid.x = nz_pppma;
grid.y = ny_pppma;
grid.z = 1;
threads.x = nx_pppma;
threads.y = 1;
threads.z = 1;
setup_fkxyz_vg <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald);
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg ");
}
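// Same launch layout as above; tabulates the optimized Green's function
// (greensfn) used by the k-space solve.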
void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab)
{
dim3 grid;
dim3 threads;
grid.x = nz_pppma;
grid.y = ny_pppma;
grid.z = 1;
threads.x = nx_pppma;
threads.y = 1;
threads.z = 1;
setup_greensfn <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald, nbx, nby, nbz, xprd, yprd, zprd_slab);
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn ");
}
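// The poisson_* wrappers below all launch one thread per mesh point
// (threads.x = nx, grid = nz x ny). poisson_scale presumably multiplies the
// transformed charge density by the Green's function table.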
void poisson_scale(int nx_pppma, int ny_pppma, int nz_pppma)
{
dim3 grid;
dim3 threads;
grid.x = nz_pppma;
grid.y = ny_pppma;
grid.z = 1;
threads.x = nx_pppma;
threads.y = 1;
threads.z = 1;
poisson_scale_kernel <<< grid, threads, 0>>>();
CUT_CHECK_ERROR("ERROR-CUDA poisson_scale ");
}
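// poisson_xgrad/ygrad/zgrad apply the ik differentiation in one direction
// each (multiplying by fkx, fky or fkz) before the inverse FFTs.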
void poisson_xgrad(int nx_pppma, int ny_pppma, int nz_pppma)
{
dim3 grid;
dim3 threads;
grid.x = nz_pppma;
grid.y = ny_pppma;
grid.z = 1;
threads.x = nx_pppma;
threads.y = 1;
threads.z = 1;
poisson_xgrad_kernel <<< grid, threads, 0>>>();
CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad ");
}
void poisson_ygrad(int nx_pppma, int ny_pppma, int nz_pppma)
{
dim3 grid;
dim3 threads;
grid.x = nz_pppma;
grid.y = ny_pppma;
grid.z = 1;
threads.x = nx_pppma;
threads.y = 1;
threads.z = 1;
poisson_ygrad_kernel <<< grid, threads, 0>>>();
CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad ");
}
void poisson_zgrad(int nx_pppma, int ny_pppma, int nz_pppma)
{
dim3 grid;
dim3 threads;
grid.x = nz_pppma;
grid.y = ny_pppma;
grid.z = 1;
threads.x = nx_pppma;
threads.y = 1;
threads.z = 1;
poisson_zgrad_kernel <<< grid, threads, 0>>>();
CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad ");
}
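// Copy one inverse-FFT field component into the vdx/vdy/vdz electric-field
// bricks; the launch covers the in-brick range [ilo..ihi]x[jlo..jhi]x[klo..khi].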
void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppma, int ny_pppma, int nz_pppma)
{
dim3 grid;
dim3 threads;
grid.x = khi - klo + 1;
grid.y = jhi - jlo + 1;
grid.z = 1;
threads.x = ihi - ilo + 1;
threads.y = 1;
threads.z = 1;
//printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x);
poisson_vdx_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo);
CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick ");
cudaThreadSynchronize();
}
void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm)
{
dim3 grid;
dim3 threads;
grid.x = khi - klo + 1;
grid.y = jhi - jlo + 1;
grid.z = 1;
threads.x = ihi - ilo + 1;
threads.y = 1;
threads.z = 1;
poisson_vdy_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo);
CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick ");
cudaThreadSynchronize();
}
void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm)
{
dim3 grid;
dim3 threads;
grid.x = khi - klo + 1;
grid.y = jhi - jlo + 1;
grid.z = 1;
threads.x = ihi - ilo + 1;
threads.y = 1;
threads.z = 1;
poisson_vdz_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo);
CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick ");
cudaThreadSynchronize();
}
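// Accumulate per-grid-point energy (and virial, if vflag is set); the shared
// memory of one ENERGY_FLOAT per thread is presumably used for the in-block
// reduction.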
void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag)
{
//printf("VFLAG_GPU: %i\n",vflag);
CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start ");
dim3 grid;
dim3 threads;
grid.x = nzhi_fft - nzlo_fft + 1;
grid.y = nyhi_fft - nylo_fft + 1;
grid.z = 1;
threads.x = nxhi_fft - nxlo_fft + 1;
threads.y = 1;
threads.z = 1;
poisson_energy_kernel <<< grid, threads, threads.x * sizeof(ENERGY_FLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end ");
}
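// Two-stage reduction of the partial energy/virial sums: kernel1 reduces
// over y within each z plane, kernel2 reduces over z; the scalar energy and,
// if vflag is set, the 6-component virial are copied back to the host.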
ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial)
{
ENERGY_FLOAT host_energy = 0;
dim3 grid;
dim3 threads;
grid.x = nz_pppma;
grid.y = 1;
grid.z = 1;
threads.x = ny_pppma;
threads.y = 1;
threads.z = 1;
sum_energy_kernel1 <<< grid, threads, ny_pppma * sizeof(ENERGY_FLOAT)>>>(vflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 ");
grid.x = 1;
grid.y = 1;
grid.z = 1;
threads.x = nz_pppma;
threads.y = 1;
threads.z = 1;
sum_energy_kernel2 <<< grid, threads, nz_pppma * sizeof(ENERGY_FLOAT)>>>(vflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 ");
cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
if(vflag)
cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy");
return host_energy;
}
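// Charge assignment: atoms are handled one warp (32 threads) per block, and
// the kernel accumulates into an integer density brick scaled by
// *cu_density_intScale (presumably so atomic adds stay exact). The do-while
// loop halves the scale when the overflow flag (cpu_flag[0]) is set and
// doubles it while there is headroom (cpu_flag[1]==0); afterwards
// scale_rho_kernel converts the integer brick back to floating point.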
void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int)
{
CUT_CHECK_ERROR("cuda_make_rho begin");
dim3 grid, threads;
int cpu_flag[3];
grid.x = (sdata->atom.nlocal + 31) / 32;
grid.y = 1;
grid.z = 1;
threads.x = 32;
threads.y = 1;
threads.z = 1;
int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
do {
cpu_flag[0] = 0;
cpu_flag[1] = 0;
cpu_flag[2] = 0;
cudaMemcpyToSymbol("density_intScale", cu_density_intScale, sizeof(PPPM_FLOAT*));
CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z");
cudaMemset(flag, 0, 3 * sizeof(int));
CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A");
cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1) * sizeof(PPPM_FLOAT));
CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B");
cudaMemset(cu_density_brick_int, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1) * sizeof(int));
CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C");
make_rho_kernel <<< grid, threads, sharedmemsize>>>((int*) flag, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1));
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA make_rho A");
cudaMemcpy((void*) &cpu_flag, flag, 3 * sizeof(int), cudaMemcpyDeviceToHost);
if(cpu_flag[0] != 0) {
(*cu_density_intScale) /= 2;
MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n", *cu_density_intScale);)
}
if((cpu_flag[0] == 0) && (cpu_flag[1] == 0)) {
(*cu_density_intScale) *= 2;
MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n", *cu_density_intScale);)
}
/* if((*cu_density_intScale)>0xe0000000)
{
printf("Error Scaling\n");
cpu_flag[0]=0;
cpu_flag[1]=1;
}*/
CUT_CHECK_ERROR("ERROR-CUDA make_rho B");
} while((cpu_flag[0] != 0) || (cpu_flag[1] == 0));
grid.x = khi - klo + 1;
grid.y = jhi - jlo + 1;
threads.x = ihi - ilo + 1;
scale_rho_kernel <<< grid, threads, 0>>>();
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale");
}
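// Map each local atom to its grid cell (part2grid); the returned flag is the
// kernel's error indicator (nonzero presumably means an atom could not be
// assigned to the owned grid region).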
int cuda_particle_map(cuda_shared_data* sdata, void* flag)
{
dim3 grid, threads;
int cpu_flag;
grid.x = (sdata->atom.nlocal + 31) / 32;
grid.y = 1;
grid.z = 1;
threads.x = 32;
threads.y = 1;
threads.z = 1;
CUT_CHECK_ERROR("ERROR-CUDA particla_map ..pre");
particle_map_kernel <<< grid, threads, 0>>>((int*) flag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA particla_map a");
cudaMemcpy((void*) &cpu_flag, flag, sizeof(int), cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("ERROR-CUDA particla_map b");
return cpu_flag;
}
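// Interpolate the electric field from the vd{x,y,z} bricks back onto the
// atoms; same warp-per-atom layout as cuda_make_rho, with three field
// components in shared memory instead of one.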
void cuda_fieldforce(cuda_shared_data* sdata, void* flag)
{
dim3 grid, threads;
grid.x = (sdata->atom.nlocal + 31) / 32;
grid.y = 1;
grid.z = 1;
threads.x = 32;
threads.y = 1;
threads.z = 1;
int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
fieldforce_kernel <<< grid, threads, sharedmemsize>>>
(sdata->pppm.nupper - sdata->pppm.nlower + 1, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1), (int*) flag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA fieldforce");
}
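// Slab (2d-periodic) dipole correction: each 32-atom block reduces its
// per-atom dipole contribution (presumably q_i*z_i) into dev_buf, and the
// host sums the per-block partial results.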
double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf)
{
dim3 grid, threads;
grid.x = (sdata->atom.nlocal + 31) / 32;
grid.y = 1;
grid.z = 1;
threads.x = 32;
threads.y = 1;
threads.z = 1;
slabcorr_energy_kernel <<< grid, threads, 32 * sizeof(ENERGY_FLOAT)>>>(dev_buf);
cudaThreadSynchronize();
cudaMemcpy((void*) buf, dev_buf, grid.x * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
double dipole_all = 0.0;
for(int i = 0; i < grid.x; i++)
dipole_all += buf[i];
return dipole_all;
}
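// Apply the slab-correction force to every local atom; ffact is the
// prefactor precomputed on the host.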
void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact)
{
dim3 grid, threads;
grid.x = (sdata->atom.nlocal + 31) / 32;
grid.y = 1;
grid.z = 1;
threads.x = 32;
threads.y = 1;
threads.z = 1;
slabcorr_force_kernel <<< grid, threads>>>(ffact);
cudaThreadSynchronize();
}
@ -519,59 +528,59 @@ void sum_virial(double* host_virial)
{
}
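// Pack the PPPM_FLOAT density brick into the FFT_FLOAT work array: the core
// kernel covers the owned (inner) region, while the x/y/z, xy/xz/yz and xyz
// kernels handle the rim (ghost) layers on each face, edge and corner.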
void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out)
{
int nslow = sdata->pppm.nzhi_in - sdata->pppm.nzlo_in;
int nmid = sdata->pppm.nyhi_in - sdata->pppm.nylo_in;
int nfast = sdata->pppm.nxhi_in - sdata->pppm.nxlo_in;
int nrimz = MAX(sdata->pppm.nzlo_in - sdata->pppm.nzlo_out, sdata->pppm.nzhi_out - sdata->pppm.nzhi_in);
int nrimy = MAX(sdata->pppm.nylo_in - sdata->pppm.nylo_out, sdata->pppm.nyhi_out - sdata->pppm.nyhi_in);
int nrimx = MAX(sdata->pppm.nxlo_in - sdata->pppm.nxlo_out, sdata->pppm.nxhi_out - sdata->pppm.nxhi_in);
dim3 grid;
grid.x = nslow + 1;
grid.y = nmid + 1;
grid.z = 1;
dim3 threads;
threads.x = nfast + 1;
threads.y = 1;
threads.z = 1;
cudaThreadSynchronize();
initfftdata_core_kernel <<< grid, threads, 0>>>(in, out);
cudaThreadSynchronize();
grid.x = nrimz;
grid.y = nmid + 1;
threads.x = nfast + 1;
initfftdata_z_kernel <<< grid, threads, 0>>>(in, out);
cudaThreadSynchronize();
grid.x = nslow + 1;
grid.y = nrimy;
threads.x = nfast + 1;
initfftdata_y_kernel <<< grid, threads, 0>>>(in, out);
cudaThreadSynchronize();
grid.x = nslow + 1;
grid.y = nmid + 1;
threads.x = nrimx;
initfftdata_x_kernel <<< grid, threads, 0>>>(in, out);
cudaThreadSynchronize();
grid.x = nrimz;
grid.y = nrimy;
threads.x = nfast + 1;
initfftdata_yz_kernel <<< grid, threads, 0>>>(in, out);
cudaThreadSynchronize();
grid.x = nrimz;
grid.y = nmid + 1;
threads.x = nrimx;
initfftdata_xz_kernel <<< grid, threads, 0>>>(in, out);
cudaThreadSynchronize();
grid.x = nslow + 1;
grid.y = nrimy;
threads.x = nrimx;
initfftdata_xy_kernel <<< grid, threads, 0>>>(in, out);
cudaThreadSynchronize();
grid.x = nrimz;
grid.y = nrimy;
threads.x = nrimx;
initfftdata_xyz_kernel <<< grid, threads, 0>>>(in, out);
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel");
}