From 5a8719e38f33a86e730325e53706a74200e61ac1 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Thu, 26 May 2011 22:16:44 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6219 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/USER-CUDA/Install.sh | 74 ++ src/USER-CUDA/comm_cuda.cpp | 1431 +++++++++++++++++++++++++++++ src/USER-CUDA/comm_cuda.h | 69 ++ src/USER-CUDA/cuda.cpp | 837 +++++++++++++++++ src/USER-CUDA/cuda.h | 153 +++ src/USER-CUDA/cuda_common.h | 344 +++++++ src/USER-CUDA/cuda_data.h | 796 ++++++++++++++++ src/USER-CUDA/cuda_modify_flags.h | 39 + src/USER-CUDA/cuda_neigh_list.cpp | 180 ++++ src/USER-CUDA/cuda_neigh_list.h | 83 ++ src/USER-CUDA/cuda_precision.h | 269 ++++++ src/USER-CUDA/cuda_shared.h | 378 ++++++++ src/USER-CUDA/domain_cuda.cpp | 270 ++++++ src/USER-CUDA/domain_cuda.h | 41 + src/USER-CUDA/modify_cuda.cpp | 442 +++++++++ src/USER-CUDA/modify_cuda.h | 82 ++ src/USER-CUDA/neigh_full_cuda.cpp | 317 +++++++ src/USER-CUDA/neighbor_cuda.cpp | 221 +++++ src/USER-CUDA/neighbor_cuda.h | 39 + src/USER-CUDA/verlet_cuda.cpp | 1103 ++++++++++++++++++++++ src/USER-CUDA/verlet_cuda.h | 63 ++ 21 files changed, 7231 insertions(+) create mode 100755 src/USER-CUDA/Install.sh create mode 100644 src/USER-CUDA/comm_cuda.cpp create mode 100644 src/USER-CUDA/comm_cuda.h create mode 100644 src/USER-CUDA/cuda.cpp create mode 100644 src/USER-CUDA/cuda.h create mode 100644 src/USER-CUDA/cuda_common.h create mode 100644 src/USER-CUDA/cuda_data.h create mode 100644 src/USER-CUDA/cuda_modify_flags.h create mode 100644 src/USER-CUDA/cuda_neigh_list.cpp create mode 100644 src/USER-CUDA/cuda_neigh_list.h create mode 100644 src/USER-CUDA/cuda_precision.h create mode 100644 src/USER-CUDA/cuda_shared.h create mode 100644 src/USER-CUDA/domain_cuda.cpp create mode 100644 src/USER-CUDA/domain_cuda.h create mode 100644 src/USER-CUDA/modify_cuda.cpp create mode 100644 src/USER-CUDA/modify_cuda.h create mode 100644 src/USER-CUDA/neigh_full_cuda.cpp create mode 100644 src/USER-CUDA/neighbor_cuda.cpp create mode 100644 src/USER-CUDA/neighbor_cuda.h create mode 100644 src/USER-CUDA/verlet_cuda.cpp create mode 100644 src/USER-CUDA/verlet_cuda.h diff --git a/src/USER-CUDA/Install.sh b/src/USER-CUDA/Install.sh new file mode 100755 index 0000000000..c11e58b4f6 --- /dev/null +++ b/src/USER-CUDA/Install.sh @@ -0,0 +1,74 @@ +# Install/unInstall package files in LAMMPS +# edit Makefile.package to include/exclude CUDA library +# do not copy child files if parent does not exist + +if (test $1 = 1) then + + if (test -e ../Makefile.package) then + sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package + sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package + sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package + sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package + sed -i '1 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package + sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package + sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) |' ../Makefile.package + sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda -lcuda -lcudart -lrt |' ../Makefile.package + fi + + cp comm_cuda.cpp .. + cp domain_cuda.cpp .. + cp modify_cuda.cpp .. + cp neighbor_cuda.cpp .. + cp neigh_full_cuda.cpp .. + cp verlet_cuda.cpp .. + + cp cuda.cpp .. + cp cuda_neigh_list.cpp .. + + cp comm_cuda.h .. + cp domain_cuda.h .. + cp modify_cuda.h .. + cp neighbor_cuda.h .. + cp verlet_cuda.h .. + + cp cuda.h .. + cp cuda_common.h .. + cp cuda_data.h .. + cp cuda_modify_flags.h .. + cp cuda_neigh_list.h .. + cp cuda_precision.h .. + cp cuda_shared.h .. + +elif (test $1 = 0) then + + if (test -e ../Makefile.package) then + sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package + sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package + sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package + sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package + fi + + rm ../comm_cuda.cpp + rm ../domain_cuda.cpp + rm ../modify_cuda.cpp + rm ../neighbor_cuda.cpp + rm ../neigh_full_cuda.cpp + rm ../verlet_cuda.cpp + + rm ../cuda.cpp + rm ../cuda_neigh_list.cpp + + rm ../comm_cuda.h + rm ../domain_cuda.h + rm ../modify_cuda.h + rm ../neighbor_cuda.h + rm ../verlet_cuda.h + + rm ../cuda.h + rm ../cuda_common.h + rm ../cuda_data.h + rm ../cuda_modify_flags.h + rm ../cuda_neigh_list.h + rm ../cuda_precision.h + rm ../cuda_shared.h +fi diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp new file mode 100644 index 0000000000..6f90227112 --- /dev/null +++ b/src/USER-CUDA/comm_cuda.cpp @@ -0,0 +1,1431 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author (triclinic) : Pieter in 't Veld (SNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include +#include +#include +#include +#include "comm_cuda.h" +#include "atom.h" +#include "atom_vec.h" +#include "force.h" +#include "pair.h" +#include "domain.h" +#include "neighbor.h" +#include "modify.h" +#include "fix.h" +#include "group.h" +#include "compute.h" +#include "cuda.h" +#include "error.h" +#include "memory.h" +#include "comm_cuda_cu.h" + +using namespace LAMMPS_NS; + +#define BUFFACTOR 1.5 +#define BUFMIN 1000 +#define BUFEXTRA 1000 + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#define BIG 1.0e20 + +enum{SINGLE,MULTI}; + +/* ---------------------------------------------------------------------- + setup MPI and allocate buffer space +------------------------------------------------------------------------- */ + +CommCuda::CommCuda(LAMMPS *lmp):Comm(lmp) +{ + cuda = lmp->cuda; + + cu_pbc=NULL; + cu_slablo=NULL; + cu_slabhi=NULL; + cu_multilo=NULL; + cu_multihi=NULL; + cu_sendlist=NULL; + + + memory->sfree(buf_send); + memory->sfree(buf_recv); + buf_send = NULL; + buf_recv = NULL; + + Comm::free_swap(); + allocate_swap(maxswap); +} + +/* ---------------------------------------------------------------------- */ + +CommCuda::~CommCuda() +{ + delete cu_sendlist; + if(cuda->pinned) + { + CudaWrapper_FreePinnedHostData((void*)buf_send); + CudaWrapper_FreePinnedHostData((void*)buf_recv); + } + else + { + memory->sfree(buf_send); + memory->sfree(buf_recv); + } + buf_send=NULL; + buf_recv=NULL; +} + +/* ---------------------------------------------------------------------- */ + +void CommCuda::init() +{ + int factor = 1; + if(cuda->shared_data.overlap_comm) factor=maxswap; + if(not buf_send) + grow_send(maxsend,0); + if(not buf_recv) + grow_recv(maxrecv); + if(not cu_sendlist) + { + cu_sendlist=new cCudaData ((int*)sendlist,maxswap,BUFMIN); + cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data(); + cuda->shared_data.comm.maxswap=maxswap; + cuda->shared_data.comm.maxlistlength=BUFMIN; + cu_sendlist->upload(); + } + delete cu_pbc; + cu_pbc=new cCudaData ((int*)pbc,cuda->shared_data.comm.maxswap,6); + cu_pbc->upload(); + + delete cu_slablo; + cu_slablo = new cCudaData(slablo,cuda->shared_data.comm.maxswap); + cu_slablo->upload(); + + delete cu_slabhi; + cu_slabhi = new cCudaData(slabhi,cuda->shared_data.comm.maxswap); + cu_slabhi->upload(); + + cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data(); + cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data(); + cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data(); + + Comm::init(); +} + +/* ---------------------------------------------------------------------- + setup spatial-decomposition communication patterns + function of neighbor cutoff(s) & cutghostuser & current box size + single style sets slab boundaries (slablo,slabhi) based on max cutoff + multi style sets type-dependent slab boundaries (multilo,multihi) +------------------------------------------------------------------------- */ + +void CommCuda::setup() +{ + Comm::setup(); + + //upload changed geometry to device + if(style == SINGLE) + { + if(cu_slablo) cu_slablo->upload(); + if(cu_slabhi) cu_slabhi->upload(); + } + else + { + if(cu_multilo) cu_multilo->upload(); + if(cu_multihi) cu_multihi->upload(); + } +} + +/* ---------------------------------------------------------------------- + forward communication of atom coords every timestep + other per-atom attributes may also be sent via pack/unpack routines +------------------------------------------------------------------------- */ + +void CommCuda::forward_comm(int mode) +{ + if(mode==0) return forward_comm_cuda(); + if(mode==1) return forward_comm_pack_cuda(); + if(mode==2) return forward_comm_transfer_cuda(); + if(mode==3) return forward_comm_unpack_cuda(); +} + + +void CommCuda::forward_comm_cuda() +{ + static int count=0; + static double kerneltime=0.0; + static double copytime=0.0; + timespec time1,time2,time3; + + int n; + MPI_Request request; + MPI_Status status; + AtomVec *avec = atom->avec; + double **x = atom->x; + + cuda->shared_data.domain.xy=domain->xy; + cuda->shared_data.domain.xz=domain->xz; + cuda->shared_data.domain.yz=domain->yz; + cuda->shared_data.domain.prd[0]=domain->prd[0]; + cuda->shared_data.domain.prd[1]=domain->prd[1]; + cuda->shared_data.domain.prd[2]=domain->prd[2]; + cuda->shared_data.domain.triclinic=domain->triclinic; + if(not comm_x_only && not avec->cudable) + { + cuda->downloadAll(); + Comm::forward_comm(); + cuda->uploadAll(); + return; + } + + // exchange data with another proc + // if other proc is self, just copy + // if comm_x_only set, exchange or copy directly to x, don't unpack + + for (int iswap = 0; iswap < nswap; iswap++) { + if (sendproc[iswap] != me) + { + if (comm_x_only) + { + + int size_forward_recv_now=0; + + if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used + size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double); + else + size_forward_recv_now=size_forward_recv[iswap]; +clock_gettime(CLOCK_REALTIME,&time1); + + MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE, + recvproc[iswap],0,world,&request); + n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) buf_send,pbc[iswap],pbc_flag[iswap]); + +clock_gettime(CLOCK_REALTIME,&time2); + + if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used + n=(n+1)*sizeof(X_FLOAT)/sizeof(double); + + //printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now,n); + MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + +clock_gettime(CLOCK_REALTIME,&time3); +cuda->shared_data.cuda_timings.comm_forward_mpi_upper+= + time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000; +cuda->shared_data.cuda_timings.comm_forward_mpi_lower+= + time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000; + + Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],(void*)buf_recv,iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv] + + } + else if (ghost_velocity) + { + MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + + if(avec->cudable) + n = avec->pack_comm_vel(sendnum[iswap],&iswap, + buf_send,pbc_flag[iswap],pbc[iswap]); + else + n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + + MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv); + } + else + { + MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + + if(avec->cudable) + n = avec->pack_comm(sendnum[iswap],&iswap, + buf_send,pbc_flag[iswap],pbc[iswap]); + else + n = avec->pack_comm(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + + MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv); + } + + } + else //sendproc == me + { + cuda->self_comm=1; + if (comm_x_only) + { + if (sendnum[iswap]) + { + n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]); + if(n<0) error->all(" # CUDA ERRROR on PackComm_Self"); + if((sizeof(X_FLOAT)!=sizeof(double)) && n) + n=(n+1)*sizeof(X_FLOAT)/sizeof(double); + } + } + else if (ghost_velocity) + { + n = avec->pack_comm_vel(sendnum[iswap],&iswap, + (double*) firstrecv,pbc_flag[iswap],pbc[iswap]); + //avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],(double*) firstrecv); + } + else + { + n = avec->pack_comm(sendnum[iswap],&iswap, + (double*) firstrecv,pbc_flag[iswap],pbc[iswap]); + //avec->unpack_comm(recvnum[iswap],firstrecv[iswap],(double*) firstrecv); + } + cuda->self_comm=0; + } + } +} + +void CommCuda::forward_comm_pack_cuda() +{ + static int count=0; + static double kerneltime=0.0; + static double copytime=0.0; + timespec time1,time2,time3; + int n; // initialize comm buffers & exchange memory + + MPI_Request request; + MPI_Status status; + AtomVec *avec = atom->avec; + double **x = atom->x; + + cuda->shared_data.domain.xy=domain->xy; + cuda->shared_data.domain.xz=domain->xz; + cuda->shared_data.domain.yz=domain->yz; + cuda->shared_data.domain.prd[0]=domain->prd[0]; + cuda->shared_data.domain.prd[1]=domain->prd[1]; + cuda->shared_data.domain.prd[2]=domain->prd[2]; + cuda->shared_data.domain.triclinic=domain->triclinic; + if(not comm_x_only && not avec->cudable) cuda->downloadAll(); //if not comm_x_only the communication routine of the atom_vec style class is used + + // exchange data with another proc + // if other proc is self, just copy + // if comm_x_only set, exchange or copy directly to x, don't unpack + + for (int iswap = 0; iswap < nswap; iswap++) { + if (sendproc[iswap] != me) + { + if (comm_x_only) + { + + +clock_gettime(CLOCK_REALTIME,&time1); + + // n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) cuda->shared_data.comm.buf_send[iswap],pbc[iswap],pbc_flag[iswap]); + n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*)buf_send,pbc[iswap],pbc_flag[iswap]); + +clock_gettime(CLOCK_REALTIME,&time2); + + if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used + n=(n+1)*sizeof(X_FLOAT)/sizeof(double); + cuda->shared_data.comm.send_size[iswap]=n; + } + else if (ghost_velocity) + { +clock_gettime(CLOCK_REALTIME,&time1); + + // n = Cuda_CommCuda_PackComm_Vel(&cuda->shared_data,sendnum[iswap],iswap,(void*) &buf_send[iswap*maxsend],pbc[iswap],pbc_flag[iswap]); + +clock_gettime(CLOCK_REALTIME,&time2); + + if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used + n=(n+1)*sizeof(X_FLOAT)/sizeof(double); + cuda->shared_data.comm.send_size[iswap]=n; + } + else + { + MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + + if(avec->cudable) + n = avec->pack_comm(sendnum[iswap],&iswap, + cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]); + else + n = avec->pack_comm(sendnum[iswap],sendlist[iswap], + cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]); + + MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv); + } + + } + else //sendproc == me + { + if (comm_x_only) + { + if (sendnum[iswap]) + { + n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]); + if(n<0) error->all(" # CUDA ERRROR on PackComm_Self"); + if((sizeof(X_FLOAT)!=sizeof(double)) && n) + n=(n+1)*sizeof(X_FLOAT)/sizeof(double); + } + } + else if (ghost_velocity) + { + n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send); + } + else + { + n = avec->pack_comm(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send); + } + } + } + if(not comm_x_only && not avec->cudable) cuda->uploadAll(); +} + +void CommCuda::forward_comm_transfer_cuda() +{ + static int count=0; + static double kerneltime=0.0; + static double copytime=0.0; + timespec time1,time2,time3; + int n; + MPI_Request request; + MPI_Status status; + AtomVec *avec = atom->avec; + double **x = atom->x; + cuda->shared_data.domain.xy=domain->xy; + cuda->shared_data.domain.xz=domain->xz; + cuda->shared_data.domain.yz=domain->yz; + cuda->shared_data.domain.prd[0]=domain->prd[0]; + cuda->shared_data.domain.prd[1]=domain->prd[1]; + cuda->shared_data.domain.prd[2]=domain->prd[2]; + cuda->shared_data.domain.triclinic=domain->triclinic; + if(not comm_x_only && not avec->cudable) cuda->downloadAll(); //if not comm_x_only the communication routine of the atom_vec style class is used +//printf("A\n"); + // exchange data with another proc + // if other proc is self, just copy + // if comm_x_only set, exchange or copy directly to x, don't unpack + + for (int iswap = 0; iswap < nswap; iswap++) { + if (sendproc[iswap] != me) + { + if (comm_x_only) + { + + int size_forward_recv_now=0; + + if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used + size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double); + else + size_forward_recv_now=size_forward_recv[iswap]; + + //printf("A: %i \n",size_forward_recv_now/1024*4); + //MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE, + // recvproc[iswap],0,world,&request); + MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE, + recvproc[iswap],0,world,&request); + //printf("%p %p %i\n",buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double)); + //memcpy(buf_send,cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap]*sizeof(double)); + // CudaWrapper_SyncStream(1); + //printf("B: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4); + CudaWrapper_DownloadCudaDataAsync((void*) buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double),2); + //MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world); +clock_gettime(CLOCK_REALTIME,&time1); + CudaWrapper_SyncStream(2); + //printf("C: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4); +clock_gettime(CLOCK_REALTIME,&time2); +cuda->shared_data.cuda_timings.comm_forward_download+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + MPI_Send(buf_send,cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + //printf("D: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4); + CudaWrapper_UploadCudaDataAsync((void*) buf_recv,cuda->shared_data.comm.buf_recv_dev[iswap], size_forward_recv_now*sizeof(double),2); +clock_gettime(CLOCK_REALTIME,&time1); + CudaWrapper_SyncStream(2); + //printf("E: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4); + //memcpy(cuda->shared_data.comm.buf_recv[iswap],buf_recv,size_forward_recv_now*sizeof(double)); + //printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now*sizeof(double),cuda->shared_data.comm.send_size[iswap]*sizeof(double)); +clock_gettime(CLOCK_REALTIME,&time3); +cuda->shared_data.cuda_timings.comm_forward_upload+= + time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000; +cuda->shared_data.cuda_timings.comm_forward_mpi_lower+= + time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000; +clock_gettime(CLOCK_REALTIME,&time3); +cuda->shared_data.cuda_timings.comm_forward_mpi_upper+= + time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000; + } + else if (ghost_velocity) + { + /* int size_forward_recv_now=0; + + if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used + size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double); + else + size_forward_recv_now=size_forward_recv[iswap]; + +clock_gettime(CLOCK_REALTIME,&time1); + + MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE, + recvproc[iswap],0,world,&request); + +clock_gettime(CLOCK_REALTIME,&time2); + + MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + +clock_gettime(CLOCK_REALTIME,&time3); +cuda->shared_data.cuda_timings.comm_forward_mpi_upper+= + time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000; +cuda->shared_data.cuda_timings.comm_forward_mpi_lower+= + time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;*/ + + } + else + { + MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + + if(avec->cudable) + n = avec->pack_comm(sendnum[iswap],&iswap, + buf_send,pbc_flag[iswap],pbc[iswap]); + else + n = avec->pack_comm(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + + MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv); + } + + } + else //sendproc == me + { + if (comm_x_only) + { + if (sendnum[iswap]) + { + } + } + else if (ghost_velocity) + { + } + else + { + n = avec->pack_comm(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send); + } + } + } + if(not comm_x_only && not avec->cudable) cuda->uploadAll(); +} + +void CommCuda::forward_comm_unpack_cuda() +{ + static int count=0; + static double kerneltime=0.0; + static double copytime=0.0; + timespec time1,time2,time3; + int n; + MPI_Request request; + MPI_Status status; + AtomVec *avec = atom->avec; + double **x = atom->x; + + cuda->shared_data.domain.xy=domain->xy; + cuda->shared_data.domain.xz=domain->xz; + cuda->shared_data.domain.yz=domain->yz; + cuda->shared_data.domain.prd[0]=domain->prd[0]; + cuda->shared_data.domain.prd[1]=domain->prd[1]; + cuda->shared_data.domain.prd[2]=domain->prd[2]; + cuda->shared_data.domain.triclinic=domain->triclinic; + if(not comm_x_only && not avec->cudable) cuda->downloadAll(); //if not comm_x_only the communication routine of the atom_vec style class is used + + // exchange data with another proc + // if other proc is self, just copy + // if comm_x_only set, exchange or copy directly to x, don't unpack + + for (int iswap = 0; iswap < nswap; iswap++) { + if (sendproc[iswap] != me) + { + if (comm_x_only) + { + + //Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],cuda->shared_data.comm.buf_recv[iswap],iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv] + Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],buf_recv,iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv] + + } + else if (ghost_velocity) + { + //Cuda_CommCuda_UnpackComm_Vel(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],(void*)&buf_recv[iswap*maxrecv]); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv] + } + else + { + MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + + if(avec->cudable) + n = avec->pack_comm(sendnum[iswap],&iswap, + buf_send,pbc_flag[iswap],pbc[iswap]); + else + n = avec->pack_comm(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + + MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv); + } + + } + else //sendproc == me + { + if (comm_x_only) + { + if (sendnum[iswap]) + { + } + } + else if (ghost_velocity) + { + } + else + { + n = avec->pack_comm(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send); + } + } + } + if(not comm_x_only && not avec->cudable) cuda->uploadAll(); +} + +void CommCuda::forward_comm_pair(Pair *pair) +{ + if(not cuda->shared_data.pair.cudable_force) + { + return Comm::forward_comm_pair(pair); + } + + int iswap,n; + double *buf; + MPI_Request request; + MPI_Status status; + + for (iswap = 0; iswap < nswap; iswap++) { + + // pack buffer + + n = pair->pack_comm(sendnum[iswap],&iswap, + buf_send,pbc_flag[iswap],pbc[iswap]); + int nrecv = recvnum[iswap]*n; + if(nrecv<0) nrecv=-(nrecv+1)/2; + int nsend = sendnum[iswap]*n; + if(nsend<0) nsend=-(nsend+1)/2; + + // exchange with another proc + // if self, set recv buffer to send buffer + + if (sendproc[iswap] != me) { + MPI_Irecv(buf_recv,nrecv,MPI_DOUBLE,recvproc[iswap],0, + world,&request); + MPI_Send(buf_send,nsend,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + buf = buf_recv; + } else buf = buf_send; + + // unpack buffer + + pair->unpack_comm(recvnum[iswap],firstrecv[iswap],buf); + } +} + +/* ---------------------------------------------------------------------- + reverse communication of forces on atoms every timestep + other per-atom attributes may also be sent via pack/unpack routines +------------------------------------------------------------------------- */ + +void CommCuda::reverse_comm() +{ + int n; + MPI_Request request; + MPI_Status status; + AtomVec *avec = atom->avec; + double **f = atom->f; + double *buf; + + if(not comm_f_only && not avec->cudable) cuda->downloadAll(); //not yet implemented in CUDA but only needed for non standard atom styles + + // exchange data with another proc + // if other proc is self, just copy + // if comm_f_only set, exchange or copy directly from f, don't pack + + for (int iswap = nswap-1; iswap >= 0; iswap--) { + if (sendproc[iswap] != me) { + if (comm_f_only) { + + int size_recv_now=size_reverse_recv[iswap]; + if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_recv[iswap]) + size_recv_now=(size_recv_now+1)*sizeof(F_FLOAT)/sizeof(double); + MPI_Irecv(buf_recv,size_recv_now,MPI_DOUBLE, + sendproc[iswap],0,world,&request); + + buf=buf_send; + if (size_reverse_send[iswap]) + { + Cuda_CommCuda_PackReverse(&cuda->shared_data,size_reverse_send[iswap]/3,firstrecv[iswap],buf); + } + else buf=NULL; + int size_reverse_send_now=size_reverse_send[iswap]; + if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_send[iswap]) + size_reverse_send_now=(size_reverse_send_now+1)*sizeof(F_FLOAT)/sizeof(double); + MPI_Send(buf,size_reverse_send_now,MPI_DOUBLE, + recvproc[iswap],0,world); + MPI_Wait(&request,&status); + Cuda_CommCuda_UnpackReverse(&cuda->shared_data,sendnum[iswap],iswap,buf_recv); + + } else { + MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send); + MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world); + MPI_Wait(&request,&status); + + avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_recv); + } + + } else { + if (comm_f_only) { + if (sendnum[iswap]) + Cuda_CommCuda_UnpackReverse_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap]); + } else { + n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send); + avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_send); + } + } + } + if(not comm_f_only && not avec->cudable) cuda->uploadAll(); //not yet implemented in CUDA but only needed for non standard atom styles +} + +/* ---------------------------------------------------------------------- + exchange: move atoms to correct processors + atoms exchanged with all 6 stencil neighbors + send out atoms that have left my box, receive ones entering my box + atoms will be lost if not inside some proc's box + can happen if atom moves outside of non-periodic bounary + or if atom moves more than one proc away + this routine called before every reneighboring + for triclinic, atoms must be in lamda coords (0-1) before exchange is called +------------------------------------------------------------------------- */ + +void CommCuda::exchange() +{ + AtomVec *avec = atom->avec; + + if(not cuda->oncpu && avec->cudable) + return exchange_cuda(); + + if(not cuda->oncpu) cuda->downloadAll(); + + Comm::exchange(); +} + + +void CommCuda::exchange_cuda() +{ + int i,m,nsend,nrecv,nrecv1,nrecv2,nlocal; + double lo,hi,value; + double **x; + double *sublo,*subhi,*buf; + MPI_Request request; + MPI_Status status; + AtomVec *avec = atom->avec; + timespec time1,time2,time3; + + // clear global->local map for owned and ghost atoms + // b/c atoms migrate to new procs in exchange() and + // new ghosts are created in borders() + // map_set() is done at end of borders() + + + if(map_style) cuda->cu_tag->download(); + + if (map_style) atom->map_clear(); + + // subbox bounds for orthogonal or triclinic + + if (triclinic == 0) { + sublo = domain->sublo; + subhi = domain->subhi; + } else { + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + // loop over dimensions + + for (int dim = 0; dim < 3; dim++) { + // fill buffer with atoms leaving my box, using < and >= + // when atom is deleted, fill it in with last atom + + cuda->shared_data.exchange_dim=dim; + + nlocal = atom->nlocal; + avec->maxsend=&maxsend; + nsend=avec->pack_exchange(dim,(double*) &buf_send); + nlocal = atom->nlocal; + + + atom->nlocal = nlocal; + + // send/recv atoms in both directions + // if 1 proc in dimension, no send/recv, set recv buf to send buf + // if 2 procs in dimension, single send/recv + // if more than 2 procs in dimension, send/recv to both neighbors + + clock_gettime(CLOCK_REALTIME,&time1); + + if (procgrid[dim] == 1) { + nrecv = nsend; + buf = buf_send; + + } else { + MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0, + &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,&status); + nrecv = nrecv1; + if (procgrid[dim] > 2) { + MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0, + &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,&status); + nrecv += nrecv2; + } + if (nrecv+1 > maxrecv) grow_recv(nrecv+1); + + MPI_Irecv(buf_recv,nrecv1,MPI_DOUBLE,procneigh[dim][1],0, + world,&request); + MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][0],0,world); + MPI_Wait(&request,&status); + + if (procgrid[dim] > 2) { + MPI_Irecv(&buf_recv[nrecv1],nrecv2,MPI_DOUBLE,procneigh[dim][0],0, + world,&request); + MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][1],0,world); + MPI_Wait(&request,&status); + + if((nrecv1==0)||(nrecv2==0)) buf_recv[nrecv]=0; + } + + buf = buf_recv; + } + //printf("nsend: %i nrecv: %i\n",nsend,nrecv); + // check incoming atoms to see if they are in my box + // if so, add to my list +clock_gettime(CLOCK_REALTIME,&time2); +cuda->shared_data.cuda_timings.comm_exchange_mpi+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + if(nrecv) + { + avec->maxsend=&maxsend; + avec->unpack_exchange(buf); + } + } + + if(atom->firstgroupname) cuda->downloadAll(); + + if(atom->firstgroupname) atom->first_reorder(); + + if(atom->firstgroupname) cuda->uploadAll(); +} + +/* ---------------------------------------------------------------------- + borders: list nearby atoms to send to neighboring procs at every timestep + one list is created for every swap that will be made + as list is made, actually do swaps + this does equivalent of a communicate (so don't need to explicitly + call communicate routine on reneighboring timestep) + this routine is called before every reneighboring + for triclinic, atoms must be in lamda coords (0-1) before borders is called +------------------------------------------------------------------------- */ + + +void CommCuda::borders() +{ + AtomVec *avec = atom->avec; + if(not cuda->oncpu && avec->cudable) + { + if(cuda->shared_data.overlap_comm&&cuda->finished_setup) + borders_cuda_overlap_forward_comm(); + else + borders_cuda(); + + return; + } + + Comm::borders(); + + cuda->setSystemParams(); + if(cuda->finished_setup) {cuda->checkResize(); cuda->uploadAll();} + cuda->shared_data.atom.nghost=atom->nghost; + cu_sendlist->upload(); +} + +void CommCuda::borders_cuda() +{ + int i,n,itype,iswap,dim,ineed,maxneed,smax,rmax; + int nsend,nrecv,nfirst,nlast,ngroup; + double lo,hi; + int *type; + double **x; + double *buf,*mlo,*mhi; + MPI_Request request; + MPI_Status status; + AtomVec *avec = atom->avec; + timespec time1,time2,time3; + + // clear old ghosts + + atom->nghost = 0; + + // do swaps over all 3 dimensions + + iswap = 0; + smax = rmax = 0; + + cuda->shared_data.comm.nsend=0; + for (dim = 0; dim < 3; dim++) { + nlast = 0; + maxneed = 2*need[dim]; + for (ineed = 0; ineed < maxneed; ineed++) { + + // find atoms within slab boundaries lo/hi using <= and >= + // check atoms between nfirst and nlast + // for first swaps in a dim, check owned and ghost + // for later swaps in a dim, only check newly arrived ghosts + // store sent atom indices in list for use in future timesteps + + x = atom->x; + if (style == SINGLE) { + lo = slablo[iswap]; + hi = slabhi[iswap]; + } else { + type = atom->type; + mlo = multilo[iswap]; + mhi = multihi[iswap]; + } + if (ineed % 2 == 0) { + nfirst = nlast; + nlast = atom->nlocal + atom->nghost; + } + + nsend = 0; + + // find send atoms according to SINGLE vs MULTI + // all atoms eligible versus atoms in bordergroup + // only need to limit loop to bordergroup for first sends (ineed < 2) + // on these sends, break loop in two: owned (in group) and ghost + do + { + if(nsend>=maxsendlist[iswap]) grow_list(iswap,static_cast (nsend*1.05)); + nsend=Cuda_CommCuda_BuildSendlist(&cuda->shared_data,bordergroup,ineed,style==SINGLE?1:0,atom->nfirst,nfirst,nlast,dim,iswap); + }while(nsend>=maxsendlist[iswap]); + // pack up list of border atoms + + if (nsend*size_border > maxsend) + grow_send(nsend*size_border,0); + + if (ghost_velocity) + n = avec->pack_border_vel(nsend,&iswap,buf_send, + pbc_flag[iswap],pbc[iswap]); + else + n = avec->pack_border(nsend,&iswap,buf_send, + pbc_flag[iswap],pbc[iswap]); + + // swap atoms with other proc + // put incoming ghosts at end of my atom arrays + // if swapping with self, simply copy, no messages + +clock_gettime(CLOCK_REALTIME,&time1); + if (sendproc[iswap] != me) { + MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0, + &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status); + if (nrecv*size_border > maxrecv) + grow_recv(nrecv*size_border); + MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE, + recvproc[iswap],0,world,&request); + MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + buf = buf_recv; + } else { + nrecv = nsend; + buf = buf_send; + } + +clock_gettime(CLOCK_REALTIME,&time2); +cuda->shared_data.cuda_timings.comm_border_mpi+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + // unpack buffer + + if (ghost_velocity) + avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf); + else + avec->unpack_border(nrecv,atom->nlocal+atom->nghost,buf); + + // set all pointers & counters + + smax = MAX(smax,nsend); + rmax = MAX(rmax,nrecv); + sendnum[iswap] = nsend; + recvnum[iswap] = nrecv; + size_forward_recv[iswap] = nrecv*size_forward; + size_reverse_send[iswap] = nrecv*size_reverse; + size_reverse_recv[iswap] = nsend*size_reverse; + firstrecv[iswap] = atom->nlocal + atom->nghost; + atom->nghost += nrecv; + iswap++; + } + } + + // insure send/recv buffers are long enough for all forward & reverse comm + + int max = MAX(maxforward*smax,maxreverse*rmax); + if (max > maxsend) grow_send(max,0); + max = MAX(maxforward*rmax,maxreverse*smax); + if (max > maxrecv) grow_recv(max); + + // reset global->local map + if(map_style) + { + cuda->cu_tag->download(); + atom->map_set(); + } + + cuda->setSystemParams(); + cuda->shared_data.atom.nghost+=n; +} + +void CommCuda::borders_cuda_overlap_forward_comm() +{ + int i,n,itype,iswap,dim,ineed,maxneed,smax,rmax; + int nsend,nrecv,nfirst,nlast,ngroup; + double lo,hi; + int *type; + double **x; + double *buf,*mlo,*mhi; + MPI_Request request; + MPI_Status status; + AtomVec *avec = atom->avec; + timespec time1,time2,time3; + + // clear old ghosts + + atom->nghost = 0; + + // do swaps over all 3 dimensions + + iswap = 0; + smax = rmax = 0; + + cuda->shared_data.comm.nsend=0; + for (dim = 0; dim < 3; dim++) { + nlast = 0; + maxneed = 2*need[dim]; + for (ineed = 0; ineed < maxneed; ineed++) { + + // find atoms within slab boundaries lo/hi using <= and >= + // check atoms between nfirst and nlast + // for first swaps in a dim, check owned and ghost + // for later swaps in a dim, only check newly arrived ghosts + // store sent atom indices in list for use in future timesteps + + x = atom->x; + if (style == SINGLE) { + lo = slablo[iswap]; + hi = slabhi[iswap]; + } else { + type = atom->type; + mlo = multilo[iswap]; + mhi = multihi[iswap]; + } + if (ineed % 2 == 0) { + nfirst = nlast; + nlast = atom->nlocal + atom->nghost; + } + + nsend = 0; + + // find send atoms according to SINGLE vs MULTI + // all atoms eligible versus atoms in bordergroup + // only need to limit loop to bordergroup for first sends (ineed < 2) + // on these sends, break loop in two: owned (in group) and ghost + do + { + if(nsend>=maxsendlist[iswap]) grow_list(iswap,static_cast (nsend*1.05)); + nsend=Cuda_CommCuda_BuildSendlist(&cuda->shared_data,bordergroup,ineed,style==SINGLE?1:0,atom->nfirst,nfirst,nlast,dim,iswap); + }while(nsend>=maxsendlist[iswap]); + cuda->shared_data.comm.nsend_swap[iswap]=nsend; + // pack up list of border atoms + + if (nsend*size_border > maxsend) + grow_send(nsend*size_border,0); + + if (ghost_velocity) + n = avec->pack_border_vel(nsend,&iswap,buf_send, + pbc_flag[iswap],pbc[iswap]); + else + n = avec->pack_border(nsend,&iswap,buf_send, + pbc_flag[iswap],pbc[iswap]); + + // swap atoms with other proc + // put incoming ghosts at end of my atom arrays + // if swapping with self, simply copy, no messages + +clock_gettime(CLOCK_REALTIME,&time1); + if (sendproc[iswap] != me) { + MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0, + &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status); + if (nrecv*size_border > maxrecv) + grow_recv(nrecv*size_border); + MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE, + recvproc[iswap],0,world,&request); + MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + buf = buf_recv; + } else { + nrecv = nsend; + buf = buf_send; + } + +clock_gettime(CLOCK_REALTIME,&time2); +cuda->shared_data.cuda_timings.comm_border_mpi+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + // unpack buffer + + if (ghost_velocity) + avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf); + else + avec->unpack_border(nrecv,atom->nlocal+atom->nghost,buf); + + // set all pointers & counters + + smax = MAX(smax,nsend); + rmax = MAX(rmax,nrecv); + sendnum[iswap] = nsend; + recvnum[iswap] = nrecv; + size_forward_recv[iswap] = nrecv*size_forward; + size_reverse_send[iswap] = nrecv*size_reverse; + size_reverse_recv[iswap] = nsend*size_reverse; + firstrecv[iswap] = atom->nlocal + atom->nghost; + atom->nghost += nrecv; + iswap++; + } + } + + // insure send/recv buffers are long enough for all forward & reverse comm + + int max = MAX(maxforward*smax,maxreverse*rmax); + if (max > maxsend) grow_send(max,0); + max = MAX(maxforward*rmax,maxreverse*smax); + if (max > maxrecv) grow_recv(max); + + // reset global->local map + if(map_style) + { + cuda->cu_tag->download(); + atom->map_set(); + } + + cuda->setSystemParams(); + cuda->shared_data.atom.nghost+=n; +} + + + + +void CommCuda::forward_comm_fix(Fix *fix) +{ + int iswap,n; + double *buf; + MPI_Request request; + MPI_Status status; + + for (iswap = 0; iswap < nswap; iswap++) { + // pack buffer + if(fix->cudable_comm&&cuda->finished_setup) + { + int swap=iswap; + if(sendproc[iswap] == me) {swap=-iswap-1; buf=(double*)&(firstrecv[iswap]);} + else buf=buf_send; + + n = fix->pack_comm(sendnum[iswap],&swap, + buf,pbc_flag[iswap],pbc[iswap]); + if(sendproc[iswap] == me) + { + continue; + } + } + else + n = fix->pack_comm(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + + // exchange with another proc + // if self, set recv buffer to send buffer + + if (sendproc[iswap] != me) { + MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, + world,&request); + MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Wait(&request,&status); + buf = buf_recv; + } else buf = buf_send; + + // unpack buffer + + fix->unpack_comm(recvnum[iswap],firstrecv[iswap],buf); + } +} + + +void CommCuda::grow_send(int n, int flag) +{ + int oldmaxsend = (maxsend+BUFEXTRA)*sizeof(double); + maxsend = static_cast (BUFFACTOR * n); + if (flag){ + if(cuda->pinned) + { + double* tmp = new double[oldmaxsend]; + memcpy((void*) tmp,(void*) buf_send,oldmaxsend*sizeof(double)); + if(buf_send) CudaWrapper_FreePinnedHostData((void*) (buf_send)); + buf_send = (double*) CudaWrapper_AllocPinnedHostData((maxsend+BUFEXTRA)*sizeof(double),false); + memcpy(buf_send,tmp,oldmaxsend*sizeof(double)); + delete [] tmp; + } + else + { + buf_send = (double *) + memory->srealloc(buf_send,(maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send");printf("srealloc\n"); + } + } + else { + if(cuda->pinned) + { + if(buf_send) CudaWrapper_FreePinnedHostData((void*) buf_send); + buf_send = (double*) CudaWrapper_AllocPinnedHostData((maxsend+BUFEXTRA)*sizeof(double),false); + } + else + { + memory->sfree(buf_send); + buf_send = (double *) memory->smalloc((maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + for(int i=0;ishared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],oldmaxsend); + cuda->shared_data.comm.buf_send_dev[i]=CudaWrapper_AllocCudaData((maxsend+BUFEXTRA)*sizeof(double)); + } + } +} +/* ---------------------------------------------------------------------- + free/malloc the size of the recv buffer as needed with BUFFACTOR +------------------------------------------------------------------------- */ + + +void CommCuda::grow_recv(int n) +{ + int oldmaxrecv = maxrecv*sizeof(double); + maxrecv = static_cast (BUFFACTOR * n); + if(cuda->pinned) + { + if(buf_recv) CudaWrapper_FreePinnedHostData((void*)buf_recv); + buf_recv = (double*) CudaWrapper_AllocPinnedHostData(maxrecv*sizeof(double), false,true); + } + else + { + memory->sfree(buf_recv); + buf_recv = (double *) memory->smalloc(maxrecv*sizeof(double), + "comm:buf_recv"); + } + for(int i=0;ishared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],oldmaxrecv); + cuda->shared_data.comm.buf_recv_dev[i]=CudaWrapper_AllocCudaData((maxrecv)*sizeof(double)); + } +} + +/* ---------------------------------------------------------------------- + realloc the size of the iswap sendlist as needed with BUFFACTOR +------------------------------------------------------------------------- */ + +void CommCuda::grow_list(int iswap, int n) +{ + + MYDBG(printf(" # CUDA CommCuda::grow_list\n");) + if(cuda->finished_setup&&cu_sendlist) cu_sendlist->download(); + if(!cu_sendlist||n*BUFFACTOR>cu_sendlist->get_dim()[1]||n*BUFFACTOR>maxsendlist[iswap]) + { + for(int i=0;i (BUFFACTOR * n); + sendlist[i] = (int *) + memory->srealloc(sendlist[i],maxsendlist[i]*sizeof(int), + "comm:sendlist[iswap]"); + } + delete cu_sendlist; + cu_sendlist=new cCudaData ((int*)sendlist,maxswap,maxsendlist[iswap]); + cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data(); + cuda->shared_data.comm.maxlistlength=maxsendlist[iswap]; + cu_sendlist->upload(); + } + } + +/* ---------------------------------------------------------------------- + realloc the buffers needed for swaps +------------------------------------------------------------------------- */ + +void CommCuda::grow_swap(int n) +{ + int oldmaxswap=maxswap; + Comm::grow_swap(n); + if(n>cu_sendlist->get_dim()[0]) + { + MYDBG(printf(" # CUDA CommCuda::grow_swap\n");) + + delete cu_sendlist; + cu_sendlist=new cCudaData ((int*)sendlist,n,BUFMIN); + cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data(); + cuda->shared_data.comm.maxlistlength=BUFMIN; + cuda->shared_data.comm.maxswap=n; + cuda->shared_data.comm.nsend_swap=new int[n]; + cuda->shared_data.comm.send_size=new int[n]; + cuda->shared_data.comm.recv_size=new int[n]; + } + for(int i=0;ishared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],maxrecv*sizeof(double)); + if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],maxsend*sizeof(double)); + cuda->shared_data.comm.buf_recv_dev[i]=NULL; + cuda->shared_data.comm.buf_send_dev[i]=NULL; + } + cuda->shared_data.comm.buf_send= new double*[n]; + cuda->shared_data.comm.buf_recv= new double*[n]; + cuda->shared_data.comm.buf_send_dev= new void*[n]; + cuda->shared_data.comm.buf_recv_dev= new void*[n]; + for(int i=0;ishared_data.comm.buf_recv[i]=NULL; + cuda->shared_data.comm.buf_send[i]=NULL; + cuda->shared_data.comm.buf_recv_dev[i]=NULL; + cuda->shared_data.comm.buf_send_dev[i]=NULL; + } + grow_send(maxsend,0); + grow_recv(maxrecv); + + maxswap=n; +} + +/* ---------------------------------------------------------------------- + allocation of swap info +------------------------------------------------------------------------- */ + +void CommCuda::allocate_swap(int n) +{ + Comm::allocate_swap(n); + + delete cu_pbc; + delete cu_slablo; + delete cu_slabhi; + + cuda->shared_data.comm.maxswap=n; + if(cu_sendlist) + { + cu_pbc=new cCudaData ((int*)pbc,n,6); + cu_slablo = new cCudaData(slablo,n); + cu_slabhi = new cCudaData(slabhi,n); + + cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data(); + cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data(); + cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data(); + } + cuda->shared_data.comm.nsend_swap=new int[n]; + cuda->shared_data.comm.send_size=new int[n]; + cuda->shared_data.comm.recv_size=new int[n]; + cuda->shared_data.comm.buf_send= new double*[n]; + cuda->shared_data.comm.buf_recv= new double*[n]; + cuda->shared_data.comm.buf_send_dev= new void*[n]; + cuda->shared_data.comm.buf_recv_dev= new void*[n]; + for(int i=0;ishared_data.comm.buf_send_dev[i]=NULL; + for(int i=0;ishared_data.comm.buf_recv_dev[i]=NULL; +} + + +/* ---------------------------------------------------------------------- + allocation of multi-type swap info +------------------------------------------------------------------------- */ + +void CommCuda::allocate_multi(int n) +{ + Comm::allocate_multi(n); + + delete cu_multilo; + delete cu_multihi; + cu_multilo = new cCudaData(slablo,n,atom->ntypes+1); + cu_multihi = new cCudaData(slabhi,n,atom->ntypes+1); + + cuda->shared_data.comm.multilo.dev_data=cu_multilo->dev_data(); + cuda->shared_data.comm.multihi.dev_data=cu_multihi->dev_data(); +} + +/* ---------------------------------------------------------------------- + free memory for swaps +------------------------------------------------------------------------- */ + +void CommCuda::free_swap() +{ + + Comm::free_swap(); + + delete cuda->shared_data.comm.nsend_swap; cuda->shared_data.comm.nsend_swap=NULL; + delete cu_pbc; cu_pbc = NULL; + delete cu_slablo; cu_slablo = NULL; + delete cu_slabhi; cu_slabhi = NULL; + for(int i=0;ishared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],maxrecv*sizeof(double)); + if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],maxsend*sizeof(double)); + } + + +} + +/* ---------------------------------------------------------------------- + free memory for multi-type swaps +------------------------------------------------------------------------- */ + +void CommCuda::free_multi() +{ + Comm::free_multi(); + delete cu_multilo; cu_multilo = NULL; + delete cu_multihi; cu_multihi = NULL; +} + diff --git a/src/USER-CUDA/comm_cuda.h b/src/USER-CUDA/comm_cuda.h new file mode 100644 index 0000000000..933d7364c1 --- /dev/null +++ b/src/USER-CUDA/comm_cuda.h @@ -0,0 +1,69 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_COMM_CUDA_H +#define LMP_COMM_CUDA_H + +#include "pointers.h" + +#include "cuda_data.h" +#include "comm.h" + +namespace LAMMPS_NS { + +class CommCuda : public Comm { +public: + CommCuda(class LAMMPS *); + ~CommCuda(); + + virtual void init(); + virtual void setup(); // setup 3d communication pattern + virtual void forward_comm(int mode=0); // forward communication of atom coords + virtual void forward_comm_cuda(); + virtual void forward_comm_pack_cuda(); + virtual void forward_comm_transfer_cuda(); + virtual void forward_comm_unpack_cuda(); + virtual void forward_comm_pair(Pair *pair); + virtual void reverse_comm(); // reverse communication of forces + virtual void exchange(); // move atoms to new procs + virtual void exchange_cuda(); // move atoms to new procs + virtual void borders(); // setup list of atoms to communicate + virtual void borders_cuda(); // setup list of atoms to communicate + virtual void borders_cuda_overlap_forward_comm(); + virtual void forward_comm_fix(class Fix *); // forward comm from a Fix + + + + + protected: + class Cuda *cuda; + cCudaData* cu_pbc; + cCudaData* cu_slablo; + cCudaData* cu_slabhi; + cCudaData* cu_multilo; + cCudaData* cu_multihi; + + cCudaData* cu_sendlist; + virtual void grow_send(int,int); // reallocate send buffer + virtual void grow_recv(int); // free/allocate recv buffer + virtual void grow_list(int, int); // reallocate one sendlist + virtual void grow_swap(int); // grow swap and multi arrays + virtual void allocate_swap(int); // allocate swap arrays + virtual void allocate_multi(int); // allocate multi arrays + virtual void free_swap(); // free swap arrays + virtual void free_multi(); // free multi arrays +}; + +} + +#endif diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp new file mode 100644 index 0000000000..f5ff1ea72d --- /dev/null +++ b/src/USER-CUDA/cuda.cpp @@ -0,0 +1,837 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "cuda.h" +#include "atom.h" +#include "domain.h" +#include "force.h" +#include "pair.h" +#include "update.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "universe.h" +#include "input.h" +#include "error.h" +#include "cuda_neigh_list.h" +//#include "pre_binning_cu.h" +#include "binning_cu.h" +//#include "reverse_binning_cu.h" +#include +#include +#include "cuda_pair_cu.h" +#include "cuda_cu.h" + +using namespace LAMMPS_NS; + +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp) +{ + cuda_exists=true; + lmp->cuda=this; + if(universe->me==0) + printf("# Using LAMMPS_CUDA \n"); + shared_data.me=universe->me; + device_set=false; + + Cuda_Cuda_GetCompileSettings(&shared_data); + + if(shared_data.compile_settings.prec_glob!=sizeof(CUDA_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_glob, sizeof(CUDA_FLOAT)/4); + if(shared_data.compile_settings.prec_x!=sizeof(X_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_x, sizeof(X_FLOAT)/4); + if(shared_data.compile_settings.prec_v!=sizeof(V_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_v, sizeof(V_FLOAT)/4); + if(shared_data.compile_settings.prec_f!=sizeof(F_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_f, sizeof(F_FLOAT)/4); + if(shared_data.compile_settings.prec_pppm!=sizeof(PPPM_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_pppm, sizeof(PPPM_FLOAT)/4); + if(shared_data.compile_settings.prec_fft!=sizeof(FFT_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_fft, sizeof(FFT_FLOAT)/4); + #ifdef FFT_CUFFT + if(shared_data.compile_settings.cufft!=1) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 1); + #else + if(shared_data.compile_settings.cufft!=0) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 0); + #endif + + if(shared_data.compile_settings.arch!=CUDA_ARCH) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: arch: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, CUDA_ARCH); + + cu_x = 0; + cu_v = 0; + cu_f = 0; + cu_tag = 0; + cu_type = 0; + cu_mask = 0; + cu_image = 0; + cu_xhold = 0; + cu_q = 0; + cu_rmass = 0; + cu_mass = 0; + cu_virial = 0; + cu_eatom = 0; + cu_vatom = 0; + cu_radius = 0; + cu_density = 0; + cu_omega = 0; + cu_torque = 0; + + cu_special = 0; + cu_nspecial = 0; + + cu_molecule = 0; + + cu_x_type = 0; + x_type = 0; + cu_v_radius = 0; + v_radius = 0; + cu_omega_rmass = 0; + omega_rmass = 0; + + binned_id = 0; + cu_binned_id = 0; + binned_idnew = 0; + cu_binned_idnew = 0; + + cu_map_array = 0; + + copy_buffer=0; + copy_buffersize=0; + + neighbor_decide_by_integrator=0; + pinned=true; + + debugdata=0; + new int[2*CUDA_MAX_DEBUG_SIZE]; + + finished_setup = false; + begin_setup = false; + finished_run = false; + + setSharedDataZero(); + + uploadtime=0; + downloadtime=0; + dotiming=false; + + dotestatom = false; + testatom = 0; + oncpu = true; + + self_comm = 0; + MYDBG( printf("# CUDA: Cuda::Cuda Done...\n");) + //cCudaData +} + +Cuda::~Cuda() +{ + + print_timings(); + + if(universe->me==0) printf("# CUDA: Free memory...\n"); + + delete cu_q; + delete cu_x; + delete cu_v; + delete cu_f; + delete cu_tag; + delete cu_type; + delete cu_mask; + delete cu_image; + delete cu_xhold; + delete cu_mass; + delete cu_rmass; + delete cu_virial; + delete cu_eng_vdwl; + delete cu_eng_coul; + delete cu_eatom; + delete cu_vatom; + delete cu_radius; + delete cu_density; + delete cu_omega; + delete cu_torque; + delete cu_molecule; + + delete cu_x_type; + delete [] x_type; + delete cu_v_radius; + delete [] v_radius; + delete cu_omega_rmass; + delete [] omega_rmass; + + delete cu_map_array; + + std::map::iterator p = neigh_lists.begin(); + while(p != neigh_lists.end()) + { + delete p->second; + ++p; + } +} + +void Cuda::accelerator(int narg, char** arg) +{ + if(device_set) return; + if(universe->me==0) + printf("# CUDA: Activate GPU \n"); + + int* devicelist=NULL; + int pppn=2; + for(int i=0;iall("Invalid Options for 'accelerator' command. Expecting a number or keyword 'special' after 'gpu/node' option."); + if(strcmp(arg[i],"special")==0) + { + if(++i==narg) + error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'."); + pppn=atoi(arg[i]); + if(pppn<1) error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'."); + if(i+pppn==narg) + error->all("Invalid Options for 'accelerator' command. Expecting list of device ids after keyword 'gpu/node special'."); + devicelist=new int[pppn]; + for(int k=0;kall("Invalid Options for 'accelerator' command. Expecting a number after 'pinned' option."); + pinned=atoi(arg[i])==0?false:true; + if((pinned==false)&&(universe->me==0)) printf(" #CUDA: Pinned memory is not used for communication\n"); + } + if(strcmp(arg[i],"dotiming")==0) + { + dotiming=true; + } + if(strcmp(arg[i],"suffix")==0) + { + if(++i==narg) + error->all("Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option."); + strcpy(lmp->asuffix,arg[i]); + } + if(strcmp(arg[i],"overlap_comm")==0) + { + shared_data.overlap_comm=1; + } + if(strcmp(arg[i],"dotest")==0) + { + if(++i==narg) + error->all("Invalid Options for 'accelerator' command. Expecting a number after 'dotest' option."); + testatom=atof(arg[i]); + dotestatom=true; + } + if(strcmp(arg[i],"override_bpa")==0) + { + if(++i==narg) + error->all("Invalid Options for 'accelerator' command. Expecting a number after 'override_bpa' option."); + shared_data.pair.override_block_per_atom = atoi(arg[i]); + } + } + CudaWrapper_Init(0, (char**)0,universe->me,pppn,devicelist); + //if(shared_data.overlap_comm) + CudaWrapper_AddStreams(3); + cu_x = 0; + cu_v = 0; + cu_f = 0; + cu_tag = 0; + cu_type = 0; + cu_mask = 0; + cu_image = 0; + cu_xhold = 0; + cu_q = 0; + cu_rmass = 0; + cu_mass = 0; + cu_virial = 0; + cu_eatom = 0; + cu_vatom = 0; + cu_radius = 0; + cu_density = 0; + cu_omega = 0; + cu_torque = 0; + + cu_special = 0; + cu_nspecial = 0; + + cu_molecule = 0; + + cu_x_type = 0; + cu_v_radius = 0; + cu_omega_rmass = 0; + + cu_binned_id = 0; + cu_binned_idnew = 0; + device_set=true; + allocate(); + delete devicelist; +} + +void Cuda::setSharedDataZero() +{ + MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");) + shared_data.atom.nlocal = 0; + shared_data.atom.nghost = 0; + shared_data.atom.nall = 0; + shared_data.atom.nmax = 0; + shared_data.atom.ntypes = 0; + shared_data.atom.q_flag = 0; + shared_data.atom.need_eatom = 0; + shared_data.atom.need_vatom = 0; + + shared_data.pair.cudable_force = 0; + shared_data.pair.collect_forces_later = 0; + shared_data.pair.use_block_per_atom = 0; + shared_data.pair.override_block_per_atom = -1; + shared_data.pair.cut = 0; + shared_data.pair.cutsq = 0; + shared_data.pair.cut_inner = 0; + shared_data.pair.cut_coul = 0; + shared_data.pair.special_lj = 0; + shared_data.pair.special_coul = 0; + + + shared_data.pppm.cudable_force = 0; + + shared_data.buffersize = 0; + shared_data.buffer_new = 1; + shared_data.buffer = NULL; + + shared_data.comm.comm_phase=0; + shared_data.overlap_comm=0; + + shared_data.comm.buffer = NULL; + shared_data.comm.buffer_size=0; + shared_data.comm.overlap_split_ratio=0; + // setTimingsZero(); +} + +void Cuda::allocate() +{ + accelerator(0,NULL); + MYDBG(printf("# CUDA: Cuda::allocate ...\n");) + if(not cu_virial) + { + cu_virial = new cCudaData (NULL, & shared_data.pair.virial , 6); + cu_eng_vdwl = new cCudaData (NULL, & shared_data.pair.eng_vdwl ,1); + cu_eng_coul = new cCudaData (NULL, & shared_data.pair.eng_coul ,1); + cu_extent = new cCudaData (extent, 6); + shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int)); + int size=2*CUDA_MAX_DEBUG_SIZE; + debugdata = new int[size]; + cu_debugdata = new cCudaData (debugdata , size); + shared_data.debugdata=cu_debugdata->dev_data(); + } + checkResize(); + setSystemParams(); + MYDBG(printf("# CUDA: Cuda::allocate done...\n");) +} + +void Cuda::setSystemParams() +{ + MYDBG(printf("# CUDA: Cuda::setSystemParams ...\n");) + shared_data.atom.nlocal = atom->nlocal; + shared_data.atom.nghost = atom->nghost; + shared_data.atom.nall = atom->nlocal + atom->nghost; + shared_data.atom.ntypes = atom->ntypes; + shared_data.atom.q_flag = atom->q_flag; + shared_data.atom.rmass_flag = atom->rmass_flag; + MYDBG(printf("# CUDA: Cuda::setSystemParams done ...\n");) +} + +void Cuda::setDomainParams() +{ + MYDBG(printf("# CUDA: Cuda::setDomainParams ...\n");) + cuda_shared_domain* cu_domain = &shared_data.domain; + + cu_domain->triclinic = domain->triclinic; + for(short i=0; i<3; ++i) + { + cu_domain->periodicity[i] = domain->periodicity[i]; + cu_domain->sublo[i] = domain->sublo[i]; + cu_domain->subhi[i] = domain->subhi[i]; + cu_domain->boxlo[i] = domain->boxlo[i]; + cu_domain->boxhi[i] = domain->boxhi[i]; + cu_domain->prd[i] = domain->prd[i]; + } + if(domain->triclinic) + { + for(short i=0; i<3; ++i) + { + cu_domain->boxlo_lamda[i] = domain->boxlo_lamda[i]; + cu_domain->boxhi_lamda[i] = domain->boxhi_lamda[i]; + cu_domain->prd_lamda[i] = domain->prd_lamda[i]; + } + cu_domain->xy = domain->xy; + cu_domain->xz = domain->xz; + cu_domain->yz = domain->yz; + } + + for(int i=0;i<6;i++) + { + cu_domain->h[i]=domain->h[i]; + cu_domain->h_inv[i]=domain->h_inv[i]; + cu_domain->h_rate[i]=domain->h_rate[i]; + } + + cu_domain->update=2; + MYDBG(printf("# CUDA: Cuda::setDomainParams done ...\n");) +} + +void Cuda::checkResize() +{ + MYDBG(printf("# CUDA: Cuda::checkResize ...\n");) + accelerator(0,NULL); + cuda_shared_atom* cu_atom = & shared_data.atom; + cuda_shared_pair* cu_pair = & shared_data.pair; + cu_atom->q_flag = atom->q_flag; + cu_atom->rmass_flag = atom->rmass ? 1 : 0; + cu_atom->nall = atom->nlocal + atom->nghost; + cu_atom->nlocal = atom->nlocal; + cu_atom->nghost = atom->nghost; + + // do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated) + if(atom->nmax > cu_atom->nmax || cu_tag == NULL) + { + delete cu_x; cu_x = new cCudaData ((double*)atom->x , & cu_atom->x , atom->nmax, 3,0,true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true); + delete cu_v; cu_v = new cCudaData ((double*)atom->v, & cu_atom->v , atom->nmax, 3); + delete cu_f; cu_f = new cCudaData ((double*)atom->f, & cu_atom->f , atom->nmax, 3,0,true); + delete cu_tag; cu_tag = new cCudaData (atom->tag , & cu_atom->tag , atom->nmax ); + delete cu_type; cu_type = new cCudaData (atom->type , & cu_atom->type , atom->nmax ); + delete cu_mask; cu_mask = new cCudaData (atom->mask , & cu_atom->mask , atom->nmax ); + delete cu_image; cu_image = new cCudaData (atom->image , & cu_atom->image , atom->nmax ); + + if(atom->rmass) + {delete cu_rmass; cu_rmass = new cCudaData (atom->rmass , & cu_atom->rmass , atom->nmax );} + + if(cu_atom->q_flag) + {delete cu_q; cu_q = new cCudaData ((double*)atom->q, & cu_atom->q , atom->nmax );}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);} + +/* + if(force->pair) + if(force->pair->eatom) + {delete cu_eatom; cu_eatom = new cCudaData (force->pair->eatom, & cu_atom->eatom , atom->nmax );}// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} + if(force->pair) + if(force->pair->vatom) + {delete cu_vatom; cu_vatom = new cCudaData ((double*)force->pair->vatom, & cu_atom->vatom , atom->nmax,6 );}// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} +*/ + if(atom->radius) + { + delete cu_radius; cu_radius = new cCudaData (atom->radius , & cu_atom->radius , atom->nmax ); + delete cu_v_radius; cu_v_radius = new cCudaData (v_radius , & cu_atom->v_radius , atom->nmax*4); + delete cu_omega_rmass; cu_omega_rmass = new cCudaData (omega_rmass , & cu_atom->omega_rmass , atom->nmax*4); + } + + /* + if(atom->density) + {delete cu_density; cu_density = new cCudaData (atom->density , & cu_atom->density , atom->nmax );} + */ + + if(atom->omega) + {delete cu_omega; cu_omega = new cCudaData (((double*) atom->omega) , & cu_atom->omega , atom->nmax,3 );} + + if(atom->torque) + {delete cu_torque; cu_torque = new cCudaData (((double*) atom->torque) , & cu_atom->torque , atom->nmax,3 );} + + if(atom->special) + {delete cu_special; cu_special = new cCudaData (((int*) &(atom->special[0][0])) , & cu_atom->special , atom->nmax,atom->maxspecial ); shared_data.atom.maxspecial=atom->maxspecial;} + if(atom->nspecial) + {delete cu_nspecial; cu_nspecial = new cCudaData (((int*) atom->nspecial) , & cu_atom->nspecial , atom->nmax,3 );} + if(atom->molecule) + {delete cu_molecule; cu_molecule = new cCudaData (((int*) atom->molecule) , & cu_atom->molecule , atom->nmax );} + shared_data.atom.special_flag = neighbor->special_flag; + shared_data.atom.molecular = atom->molecular; + + cu_atom->update_nmax = 2; + cu_atom->nmax = atom->nmax; + + //delete [] x_type; x_type = new X_FLOAT4[atom->nmax]; + delete cu_x_type; cu_x_type = new cCudaData (x_type , & cu_atom->x_type , atom->nmax*4); + // shared_data.buffer_new = 2; + } + + if(((cu_xhold==NULL)||(cu_xhold->get_dim()[0]maxhold))&&neighbor->xhold) + { + delete cu_xhold; cu_xhold = new cCudaData ((double*)neighbor->xhold, & cu_atom->xhold , neighbor->maxhold, 3); + shared_data.atom.maxhold=neighbor->maxhold; + } + + if(atom->mass && !cu_mass) + {cu_mass = new cCudaData (atom->mass , & cu_atom->mass , atom->ntypes+1);} + cu_atom->mass_host = atom->mass; + + if(atom->map_style==1) + { + if((cu_map_array==NULL)) + { + cu_map_array = new cCudaData (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size() ); + } + } + + + // if any of the host pointers have changed (e.g. re-allocated somewhere else), set to correct pointer + if(cu_x ->get_host_data() != atom->x) cu_x ->set_host_data((double*) (atom->x)); + if(cu_v ->get_host_data() != atom->v) cu_v ->set_host_data((double*) (atom->v)); + if(cu_f ->get_host_data() != atom->f) cu_f ->set_host_data((double*) (atom->f)); + if(cu_tag ->get_host_data() != atom->tag) cu_tag ->set_host_data(atom->tag); + if(cu_type->get_host_data() != atom->type) cu_type->set_host_data(atom->type); + if(cu_mask->get_host_data() != atom->mask) cu_mask->set_host_data(atom->mask); + if(cu_image->get_host_data() != atom->image) cu_mask->set_host_data(atom->image); + + if(cu_xhold) + if(cu_xhold->get_host_data()!= neighbor->xhold) cu_xhold->set_host_data((double*)(neighbor->xhold)); + + if(atom->rmass) + if(cu_rmass->get_host_data() != atom->rmass) cu_rmass->set_host_data((double*) (atom->rmass)); + + if(cu_atom->q_flag) + if(cu_q->get_host_data() != atom->q) cu_q->set_host_data((double*) (atom->q)); + + if(atom->radius) + if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*) (atom->radius)); + + /* + if(atom->density) + if(cu_density->get_host_data() != atom->density) cu_density->set_host_data((double*) (atom->density)); + */ + + if(atom->omega) + if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*) (atom->omega)); + + if(atom->torque) + if(cu_torque->get_host_data() != atom->torque) cu_torque->set_host_data((double*) (atom->torque)); + + if(atom->special) + if(cu_special->get_host_data() != atom->special) + {delete cu_special; cu_special = new cCudaData (((int*) atom->special) , & cu_atom->special , atom->nmax,atom->maxspecial ); shared_data.atom.maxspecial=atom->maxspecial;} + + if(atom->nspecial) + if(cu_nspecial->get_host_data() != atom->nspecial) cu_nspecial->set_host_data((int*) (atom->nspecial)); + + if(atom->molecule) + if(cu_molecule->get_host_data() != atom->molecule) cu_molecule->set_host_data((int*) (atom->molecule)); + + if(force) + if(cu_virial ->get_host_data() != force->pair->virial) cu_virial ->set_host_data(force->pair->virial); + if(force) + if(cu_eng_vdwl ->get_host_data() != &force->pair->eng_vdwl) cu_eng_vdwl ->set_host_data(&force->pair->eng_vdwl); + if(force) + if(cu_eng_coul ->get_host_data() != &force->pair->eng_coul) cu_eng_coul ->set_host_data(&force->pair->eng_coul); + + cu_atom->update_nlocal = 2; + MYDBG(printf("# CUDA: Cuda::checkResize done...\n");) +} + +void Cuda::evsetup_eatom_vatom(int eflag_atom,int vflag_atom) +{ + if(eflag_atom) + { + if(not cu_eatom) + cu_eatom = new cCudaData (force->pair->eatom, & (shared_data.atom.eatom) , atom->nmax );// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} + cu_eatom->set_host_data(force->pair->eatom); + cu_eatom->memset_device(0); + } + if(vflag_atom) + { + if(not cu_vatom) + cu_vatom = new cCudaData ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} + cu_vatom->set_host_data((double*)force->pair->vatom); + cu_vatom->memset_device(0); + } +} + +void Cuda::uploadAll() +{ + MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");) + timespec starttime; + timespec endtime; + + if(atom->nmax!=shared_data.atom.nmax) checkResize(); + clock_gettime(CLOCK_REALTIME,&starttime); + cu_x ->upload(); + cu_v ->upload(); + cu_f ->upload(); + cu_tag ->upload(); + cu_type->upload(); + cu_mask->upload(); + cu_image->upload(); + if(shared_data.atom.q_flag) cu_q ->upload(); + + //printf("A3\n"); + //if(shared_data.atom.need_eatom) cu_eatom->upload(); + //printf("A4\n"); + //if(shared_data.atom.need_vatom) cu_vatom->upload(); + //printf("A5\n"); + + if(atom->rmass) cu_rmass->upload(); + + if(atom->radius) cu_radius->upload(); + // if(atom->density) cu_density->upload(); + if(atom->omega) cu_omega->upload(); + if(atom->torque) cu_torque->upload(); + if(atom->special) cu_special->upload(); + if(atom->nspecial) cu_nspecial->upload(); + if(atom->molecule) cu_molecule->upload(); + if(cu_eatom) cu_eatom->upload(); + if(cu_vatom) cu_vatom->upload(); + + clock_gettime(CLOCK_REALTIME,&endtime); + uploadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);) + CUDA_IF_BINNING(Cuda_Binning (& shared_data);) + + shared_data.atom.triggerneighsq=neighbor->triggersq; + MYDBG(printf("# CUDA: Cuda::uploadAll() ... end\n");) +} + +void Cuda::downloadAll() +{ + MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");) + timespec starttime; + timespec endtime; + + if(atom->nmax!=shared_data.atom.nmax) checkResize(); + + CUDA_IF_BINNING( Cuda_ReverseBinning(& shared_data); ) + clock_gettime(CLOCK_REALTIME,&starttime); + cu_x ->download(); + cu_v ->download(); + cu_f ->download(); + cu_type->download(); + cu_tag ->download(); + cu_mask->download(); + cu_image->download(); + + //if(shared_data.atom.need_eatom) cu_eatom->download(); + //if(shared_data.atom.need_vatom) cu_vatom->download(); + + if(shared_data.atom.q_flag) cu_q ->download(); + if(atom->rmass) cu_rmass->download(); + + if(atom->radius) cu_radius->download(); + // if(atom->density) cu_density->download(); + if(atom->omega) cu_omega->download(); + if(atom->torque) cu_torque->download(); + if(atom->special) cu_special->download(); + if(atom->nspecial) cu_nspecial->download(); + if(atom->molecule) cu_molecule->download(); + if(cu_eatom) cu_eatom->download(); + if(cu_vatom) cu_vatom->download(); + + clock_gettime(CLOCK_REALTIME,&endtime); + downloadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");) +} + +void Cuda::downloadX() +{ + Cuda_Pair_RevertXType(& this->shared_data); + cu_x->download(); +} + +CudaNeighList* Cuda::registerNeighborList(class NeighList* neigh_list) +{ + MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... start a\n");) + std::map::iterator p = neigh_lists.find(neigh_list); + + if(p != neigh_lists.end()) return p->second; + else + { + CudaNeighList* neigh_list_cuda = new CudaNeighList(lmp, neigh_list); + neigh_lists.insert(std::pair(neigh_list, neigh_list_cuda)); + return neigh_list_cuda; + } + MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... end b\n");) +} + +void Cuda::uploadAllNeighborLists() +{ + MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... start\n");) + std::map::iterator p = neigh_lists.begin(); + while(p != neigh_lists.end()) + { + p->second->nl_upload(); + if(not (p->second->neigh_list->cuda_list->build_cuda)) + for(int i=0;inlocal;i++) + p->second->sneighlist.maxneighbors=MAX(p->second->neigh_list->numneigh[i],p->second->sneighlist.maxneighbors) ; + ++p; + } + MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... done\n");) +} + +void Cuda::downloadAllNeighborLists() +{ + MYDBG(printf("# CUDA: Cuda::downloadAllNeighborList() ... start\n");) + std::map::iterator p = neigh_lists.begin(); + while(p != neigh_lists.end()) + { + p->second->nl_download(); + ++p; + } +} + +void Cuda::update_xhold(int &maxhold,double* xhold) +{ + if(this->shared_data.atom.maxholdnmax) + { + maxhold = atom->nmax; + delete this->cu_xhold; this->cu_xhold = new cCudaData ((double*)xhold, & this->shared_data.atom.xhold , maxhold, 3); + } + this->shared_data.atom.maxhold=maxhold; + CudaWrapper_CopyData(this->cu_xhold->dev_data(),this->cu_x->dev_data(),3*atom->nmax*sizeof(X_FLOAT)); +} + +void Cuda::setTimingsZero() +{ + shared_data.cuda_timings.test1=0; + shared_data.cuda_timings.test2=0; + + //communication + shared_data.cuda_timings.comm_forward_total = 0; + shared_data.cuda_timings.comm_forward_mpi_upper = 0; + shared_data.cuda_timings.comm_forward_mpi_lower = 0; + shared_data.cuda_timings.comm_forward_kernel_pack = 0; + shared_data.cuda_timings.comm_forward_kernel_unpack = 0; + shared_data.cuda_timings.comm_forward_upload = 0; + shared_data.cuda_timings.comm_forward_download = 0; + + shared_data.cuda_timings.comm_exchange_total = 0; + shared_data.cuda_timings.comm_exchange_mpi = 0; + shared_data.cuda_timings.comm_exchange_kernel_pack = 0; + shared_data.cuda_timings.comm_exchange_kernel_unpack = 0; + shared_data.cuda_timings.comm_exchange_kernel_fill = 0; + shared_data.cuda_timings.comm_exchange_cpu_pack= 0; + shared_data.cuda_timings.comm_exchange_upload = 0; + shared_data.cuda_timings.comm_exchange_download = 0; + + shared_data.cuda_timings.comm_border_total = 0; + shared_data.cuda_timings.comm_border_mpi = 0; + shared_data.cuda_timings.comm_border_kernel_pack = 0; + shared_data.cuda_timings.comm_border_kernel_unpack = 0; + shared_data.cuda_timings.comm_border_kernel_buildlist = 0; + shared_data.cuda_timings.comm_border_kernel_self = 0; + shared_data.cuda_timings.comm_border_upload = 0; + shared_data.cuda_timings.comm_border_download = 0; + + //pair forces + shared_data.cuda_timings.pair_xtype_conversion = 0; + shared_data.cuda_timings.pair_kernel = 0; + shared_data.cuda_timings.pair_virial = 0; + shared_data.cuda_timings.pair_force_collection = 0; + + //neighbor + shared_data.cuda_timings.neigh_bin = 0; + shared_data.cuda_timings.neigh_build = 0; + shared_data.cuda_timings.neigh_special = 0; + + //PPPM + shared_data.cuda_timings.pppm_particle_map; + shared_data.cuda_timings.pppm_make_rho; + shared_data.cuda_timings.pppm_brick2fft; + shared_data.cuda_timings.pppm_poisson; + shared_data.cuda_timings.pppm_fillbrick; + shared_data.cuda_timings.pppm_fieldforce; + shared_data.cuda_timings.pppm_compute; + + CudaWrapper_CheckUploadTime(true); + CudaWrapper_CheckDownloadTime(true); + CudaWrapper_CheckCPUBufUploadTime(true); + CudaWrapper_CheckCPUBufDownloadTime(true); +} + +void Cuda::print_timings() +{ + if(universe->me!=0) return; + if(not dotiming) return; + printf("\n # CUDA: Special timings\n\n"); + printf("\n Transfer Times\n"); + printf(" PCIe Upload: \t %lf s\n",CudaWrapper_CheckUploadTime()); + printf(" PCIe Download:\t %lf s\n",CudaWrapper_CheckDownloadTime()); + printf(" CPU Tempbbuf Upload: \t %lf \n",CudaWrapper_CheckCPUBufUploadTime()); + printf(" CPU Tempbbuf Download: \t %lf \n",CudaWrapper_CheckCPUBufDownloadTime()); + + printf("\n Communication \n"); + + printf(" Forward Total \t %lf \n",shared_data.cuda_timings.comm_forward_total); + printf(" Forward MPI Upper Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_upper); + printf(" Forward MPI Lower Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_lower); + printf(" Forward Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_pack); + printf(" Forward Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_unpack); + printf(" Forward Kernel Self \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_self); + printf(" Forward Upload \t %lf \n",shared_data.cuda_timings.comm_forward_upload); + printf(" Forward Download \t %lf \n",shared_data.cuda_timings.comm_forward_download); + printf(" Forward Overlap Split Ratio\t %lf \n",shared_data.comm.overlap_split_ratio); + printf("\n"); + + printf(" Exchange Total \t %lf \n",shared_data.cuda_timings.comm_exchange_total); + printf(" Exchange MPI \t %lf \n",shared_data.cuda_timings.comm_exchange_mpi); + printf(" Exchange Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_pack); + printf(" Exchange Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_unpack); + printf(" Exchange Kernel Fill \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill); + printf(" Exchange CPU Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack); + printf(" Exchange Upload \t %lf \n",shared_data.cuda_timings.comm_exchange_upload); + printf(" Exchange Download \t %lf \n",shared_data.cuda_timings.comm_exchange_download); + printf("\n"); + + printf(" Border Total \t %lf \n",shared_data.cuda_timings.comm_border_total); + printf(" Border MPI \t %lf \n",shared_data.cuda_timings.comm_border_mpi); + printf(" Border Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_border_kernel_pack); + printf(" Border Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_border_kernel_unpack); + printf(" Border Kernel Self \t %lf \n",shared_data.cuda_timings.comm_border_kernel_self); + printf(" Border Kernel BuildList \t %lf \n",shared_data.cuda_timings.comm_border_kernel_buildlist); + printf(" Border Upload \t %lf \n",shared_data.cuda_timings.comm_border_upload); + printf(" Border Download \t %lf \n",shared_data.cuda_timings.comm_border_download); + printf("\n"); + + //pair forces + printf(" Pair XType Conversion \t %lf \n",shared_data.cuda_timings.pair_xtype_conversion ); + printf(" Pair Kernel \t %lf \n",shared_data.cuda_timings.pair_kernel ); + printf(" Pair Virial \t %lf \n",shared_data.cuda_timings.pair_virial ); + printf(" Pair Force Collection \t %lf \n",shared_data.cuda_timings.pair_force_collection ); + printf("\n"); + + //neighbor + printf(" Neighbor Binning \t %lf \n",shared_data.cuda_timings.neigh_bin ); + printf(" Neighbor Build \t %lf \n",shared_data.cuda_timings.neigh_build ); + printf(" Neighbor Special \t %lf \n",shared_data.cuda_timings.neigh_special ); + printf("\n"); + + //pppm + if(force->kspace) + { + printf(" PPPM Total \t %lf \n",shared_data.cuda_timings.pppm_compute ); + printf(" PPPM Particle Map \t %lf \n",shared_data.cuda_timings.pppm_particle_map ); + printf(" PPPM Make Rho \t %lf \n",shared_data.cuda_timings.pppm_make_rho ); + printf(" PPPM Brick2fft \t %lf \n",shared_data.cuda_timings.pppm_brick2fft ); + printf(" PPPM Poisson \t %lf \n",shared_data.cuda_timings.pppm_poisson ); + printf(" PPPM Fillbrick \t %lf \n",shared_data.cuda_timings.pppm_fillbrick ); + printf(" PPPM Fieldforce \t %lf \n",shared_data.cuda_timings.pppm_fieldforce ); + printf("\n"); + } + + printf(" Debug Test 1 \t %lf \n",shared_data.cuda_timings.test1); + printf(" Debug Test 2 \t %lf \n",shared_data.cuda_timings.test2); + + printf("\n"); +} diff --git a/src/USER-CUDA/cuda.h b/src/USER-CUDA/cuda.h new file mode 100644 index 0000000000..21d47610fb --- /dev/null +++ b/src/USER-CUDA/cuda.h @@ -0,0 +1,153 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef CUDA_H +#define CUDA_H + +#include "pointers.h" +#include "cuda_shared.h" +#include "cuda_data.h" +#include "cuda_precision.h" +#include + +#ifdef _DEBUG +#define MYDBG(a) a +#else +#define MYDBG(a) +#endif + +namespace LAMMPS_NS +{ + class Cuda : protected Pointers + { + public: + Cuda(class LAMMPS *); + ~Cuda(); + //static void setDevice(class LAMMPS*); + void allocate(); + + void accelerator(int, char **); + + void setSharedDataZero(); + void setSystemParams(); + + void setDomainParams(); + + void checkResize(); + void evsetup_eatom_vatom(int eflag_atom,int vflag_atom); + void uploadAll(); + void downloadAll(); + void downloadX(); + + class CudaNeighList* registerNeighborList(class NeighList* neigh_list); + void uploadAllNeighborLists(); + void downloadAllNeighborLists(); + void set_neighinit(int dist_check, double triggerneighsq) + { + shared_data.atom.dist_check=dist_check; + shared_data.atom.triggerneighsq = triggerneighsq; + } + bool decide_by_integrator() + { + return neighbor_decide_by_integrator && cu_xhold && finished_setup; + } + void update_xhold(int &maxhold,double* xhold); + + void setTimingsZero(); + void print_timings(); + + void cu_x_download() {cu_x->download();} + bool device_set; + bool dotiming; + bool dotestatom; + int testatom; + + double uploadtime,downloadtime; + bool finished_setup,begin_setup; + bool oncpu; + bool finished_run; + + int self_comm; + + int cuda_exists; + + double extent[6]; + int* debugdata; + // data shared between host code and device code + // (number of atoms, device pointers for up- & download) + cuda_shared_data shared_data; + + cCudaData* cu_q; + cCudaData* cu_f; + cCudaData* cu_mass; + cCudaData* cu_rmass; + cCudaData* cu_v; + cCudaData* cu_x; + cCudaData* cu_xhold; + cCudaData* cu_mask; + cCudaData* cu_tag; + cCudaData* cu_type; + cCudaData* cu_image; + cCudaData* cu_eatom; + cCudaData* cu_vatom; + cCudaData* cu_virial; + cCudaData* cu_eng_vdwl; + cCudaData* cu_eng_coul; + cCudaData* cu_extent; + int* binned_id; + cCudaData* cu_binned_id; + int* binned_idnew; + cCudaData* cu_binned_idnew; + cCudaData* cu_debugdata; + cCudaData* cu_radius; + cCudaData* cu_density; + cCudaData* cu_omega; + cCudaData* cu_torque; + cCudaData* cu_special; + cCudaData* cu_nspecial; + cCudaData* cu_molecule; + + + cCudaData* cu_x_type; + X_FLOAT* x_type; + + cCudaData* cu_v_radius; + V_FLOAT* v_radius; + + cCudaData* cu_omega_rmass; + V_FLOAT* omega_rmass; + + cCudaData* cu_map_array; + int neighbor_decide_by_integrator; + + bool pinned; + + void* copy_buffer; + int copy_buffersize; + + private: + std::map neigh_lists; + }; +} + +#endif // CUDA_H diff --git a/src/USER-CUDA/cuda_common.h b/src/USER-CUDA/cuda_common.h new file mode 100644 index 0000000000..d4687ebd06 --- /dev/null +++ b/src/USER-CUDA/cuda_common.h @@ -0,0 +1,344 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef _CUDA_COMMON_H_ +#define _CUDA_COMMON_H_ + +//#include "cutil.h" +#include "cuda_precision.h" +#include "cuda_wrapper_cu.h" + +#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types +//this can not be arbitrarly large, since constant space is limited. +//in principle one could alter potentials to use global memory for parameters, some du that already since the first examples I encountered had a high number (20+) of atom types +//Christian +#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE) +#define CUDA_MAX_NSPECIAL 25 + +// define some easy-to-use debug and emulation macros +#ifdef _DEBUG +#define MYDBG(a) a +#else +#define MYDBG(a) +#endif + +#if __DEVICE_EMULATION__ +#define MYEMU(a) a +#else +#define MYEMU(a) +#endif + +#define MYEMUDBG(a) MYEMU(MYDBG(a)) + +// Add Prefix (needed as workaround, same constant's names in different files causes conflict) +#define MY_ADD_PREFIX(prefix, var) prefix##_##var +#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var) +#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var) + +#define MY_VAR_TO_STR(var) #var +#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var) +#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var)) + +#define CUDA_USE_TEXTURE +#define CUDA_USE_FLOAT4 + +//constants used by many classes + +//domain +#define _boxhi MY_AP(boxhi) +#define _boxlo MY_AP(boxlo) +#define _subhi MY_AP(subhi) +#define _sublo MY_AP(sublo) +#define _box_size MY_AP(box_size) +#define _prd MY_AP(prd) +#define _periodicity MY_AP(periodicity) +#define _triclinic MY_AP(triclinic) +#define _boxhi_lamda MY_AP(boxhi_lamda) +#define _boxlo_lamda MY_AP(boxlo_lamda) +#define _prd_lamda MY_AP(prd_lamda) +#define _h MY_AP(h) +#define _h_inv MY_AP(h_inv) +#define _h_rate MY_AP(h_rate) +__device__ __constant__ X_FLOAT _boxhi[3]; +__device__ __constant__ X_FLOAT _boxlo[3]; +__device__ __constant__ X_FLOAT _subhi[3]; +__device__ __constant__ X_FLOAT _sublo[3]; +__device__ __constant__ X_FLOAT _box_size[3]; +__device__ __constant__ X_FLOAT _prd[3]; +__device__ __constant__ int _periodicity[3]; +__device__ __constant__ int _triclinic; +__device__ __constant__ X_FLOAT _boxhi_lamda[3]; +__device__ __constant__ X_FLOAT _boxlo_lamda[3]; +__device__ __constant__ X_FLOAT _prd_lamda[3]; +__device__ __constant__ X_FLOAT _h[6]; +__device__ __constant__ X_FLOAT _h_inv[6]; +__device__ __constant__ V_FLOAT _h_rate[6]; + + +//atom properties +#define _x MY_AP(x) +#define _v MY_AP(v) +#define _f MY_AP(f) +#define _tag MY_AP(tag) +#define _type MY_AP(type) +#define _mask MY_AP(mask) +#define _image MY_AP(image) +#define _q MY_AP(q) +#define _mass MY_AP(mass) +#define _rmass MY_AP(rmass) +#define _rmass_flag MY_AP(rmass_flag) +#define _eatom MY_AP(eatom) +#define _vatom MY_AP(vatom) +#define _x_type MY_AP(x_type) +#define _radius MY_AP(radius) +#define _density MY_AP(density) +#define _omega MY_AP(omega) +#define _torque MY_AP(torque) +#define _special MY_AP(special) +#define _maxspecial MY_AP(maxspecial) +#define _nspecial MY_AP(nspecial) +#define _special_flag MY_AP(special_flag) +#define _molecule MY_AP(molecule) +#define _v_radius MY_AP(v_radius) +#define _omega_rmass MY_AP(omega_rmass) +#define _freeze_group_bit MY_AP(freeze_group_bit) +#define _map_array MY_AP(map_array) +__device__ __constant__ X_FLOAT* _x; //holds pointer to positions +__device__ __constant__ V_FLOAT* _v; +__device__ __constant__ F_FLOAT* _f; +__device__ __constant__ int* _tag; +__device__ __constant__ int* _type; +__device__ __constant__ int* _mask; +__device__ __constant__ int* _image; +__device__ __constant__ V_FLOAT* _mass; +__device__ __constant__ F_FLOAT* _q; +__device__ __constant__ V_FLOAT* _rmass; +__device__ __constant__ int _rmass_flag; +__device__ __constant__ ENERGY_FLOAT* _eatom; +__device__ __constant__ ENERGY_FLOAT* _vatom; +__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to positions +__device__ __constant__ X_FLOAT* _radius; +__device__ __constant__ F_FLOAT* _density; +__device__ __constant__ V_FLOAT* _omega; +__device__ __constant__ F_FLOAT* _torque; +__device__ __constant__ int* _special; +__device__ __constant__ int _maxspecial; +__device__ __constant__ int* _nspecial; +__device__ __constant__ int _special_flag[4]; +__device__ __constant__ int* _molecule; +__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to positions +__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to positions +__device__ __constant__ int _freeze_group_bit; +__device__ __constant__ int* _map_array; + +#ifdef CUDA_USE_TEXTURE + + #define _x_tex MY_AP(x_tex) + #if X_PRECISION == 1 + texture _x_tex; + #else + texture _x_tex; + #endif + + #define _type_tex MY_AP(type_tex) + texture _type_tex; + + #define _x_type_tex MY_AP(x_type_tex) + #if X_PRECISION == 1 + texture _x_type_tex; + #else + texture _x_type_tex; + #endif + + #define _v_radius_tex MY_AP(v_radius_tex) + #if V_PRECISION == 1 + texture _v_radius_tex; + #else + texture _v_radius_tex; + #endif + + #define _omega_rmass_tex MY_AP(omega_rmass_tex) + #if V_PRECISION == 1 + texture _omega_rmass_tex; + #else + texture _omega_rmass_tex; + #endif + + #define _q_tex MY_AP(q_tex) + #if F_PRECISION == 1 + texture _q_tex; + #else + texture _q_tex; + #endif + +#endif + +//neighbor +#ifdef IncludeCommonNeigh +#define _inum MY_AP(inum) +#define _inum_border MY_AP(inum_border) +#define _ilist MY_AP(ilist) +#define _ilist_border MY_AP(ilist_border) +#define _numneigh MY_AP(numneigh) +#define _numneigh_border MY_AP(numneigh_border) +#define _numneigh_inner MY_AP(numneigh_inner) +#define _firstneigh MY_AP(firstneigh) +#define _neighbors MY_AP(neighbors) +#define _neighbors_border MY_AP(neighbors_border) +#define _neighbors_inner MY_AP(neighbors_inner) +#define _reneigh_flag MY_AP(reneigh_flag) +#define _triggerneighsq MY_AP(triggerneighsq) +#define _xhold MY_AP(xhold) +#define _maxhold MY_AP(maxhold) +#define _dist_check MY_AP(dist_check) +#define _neighbor_maxlocal MY_AP(neighbor_maxlocal) +#define _maxneighbors MY_AP(maxneighbors) +#define _overlap_comm MY_AP(overlap_comm) +__device__ __constant__ int _inum; +__device__ __constant__ int* _inum_border; +__device__ __constant__ int* _ilist; +__device__ __constant__ int* _ilist_border; +__device__ __constant__ int* _numneigh; +__device__ __constant__ int* _numneigh_border; +__device__ __constant__ int* _numneigh_inner; +__device__ __constant__ int** _firstneigh; +__device__ __constant__ int* _neighbors; +__device__ __constant__ int* _neighbors_border; +__device__ __constant__ int* _neighbors_inner; +__device__ __constant__ int* _reneigh_flag; +__device__ __constant__ X_FLOAT _triggerneighsq; +__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions +__device__ __constant__ int _maxhold; +__device__ __constant__ int _dist_check; +__device__ __constant__ int _neighbor_maxlocal; +__device__ __constant__ int _maxneighbors; +__device__ __constant__ int _overlap_comm; +#endif + +//system properties +#define _nall MY_AP(nall) +#define _nghost MY_AP(nghost) +#define _nlocal MY_AP(nlocal) +#define _nmax MY_AP(nmax) +#define _cuda_ntypes MY_AP(cuda_ntypes) +#define _dtf MY_AP(dtf) +#define _dtv MY_AP(dtv) +#define _factor MY_AP(factor) +#define _virial MY_AP(virial) +#define _eng_vdwl MY_AP(eng_vdwl) +#define _eng_coul MY_AP(eng_coul) +#define _molecular MY_AP(molecular) +__device__ __constant__ unsigned _nall; +__device__ __constant__ unsigned _nghost; +__device__ __constant__ unsigned _nlocal; +__device__ __constant__ unsigned _nmax; +__device__ __constant__ unsigned _cuda_ntypes; +__device__ __constant__ V_FLOAT _dtf; +__device__ __constant__ X_FLOAT _dtv; +__device__ __constant__ V_FLOAT _factor; +__device__ __constant__ ENERGY_FLOAT* _virial; +__device__ __constant__ ENERGY_FLOAT* _eng_vdwl; +__device__ __constant__ ENERGY_FLOAT* _eng_coul; +__device__ __constant__ int _molecular; + +//other general constants +#define _buffer MY_AP(buffer) +#define _flag MY_AP(flag) +#define _debugdata MY_AP(debugdata) +__device__ __constant__ void* _buffer; +__device__ __constant__ int* _flag; +__device__ __constant__ int* _debugdata; + +// pointers to data fields on GPU are hold in constant space +// -> reduces register usage and number of parameters for kernelcalls +// will be variables of file scope in cuda files + + + + +// maybe used to output cudaError_t +#define MY_OUTPUT_RESULT(result) \ + switch(result) \ + { \ + case cudaSuccess: printf(" => cudaSuccess\n"); break; \ + case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \ + case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \ + case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \ + case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \ + default: printf(" => unknown\n"); break; \ + } + +#ifdef _DEBUG +# define CUT_CHECK_ERROR(errorMessage) { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + } +#else +# define CUT_CHECK_ERROR(errorMessage) { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + } +#endif + +# define CUDA_SAFE_CALL_NO_SYNC( call) { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } + +# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call); + +#define X_MASK 1 +#define V_MASK 2 +#define F_MASK 4 +#define TAG_MASK 8 +#define TYPE_MASK 16 +#define MASK_MASK 32 +#define IMAGE_MASK 64 +#define Q_MASK 128 +#define MOLECULE_MASK 256 +#define RMASS_MASK 512 +#define RADIUS_MASK 1024 +#define DENSITY_MASK 2048 +#define OMEGA_MASK 4096 +#define TORQUE_MASK 8192 + + + +#endif // #ifdef _CUDA_COMMON_H_ diff --git a/src/USER-CUDA/cuda_data.h b/src/USER-CUDA/cuda_data.h new file mode 100644 index 0000000000..6311598332 --- /dev/null +++ b/src/USER-CUDA/cuda_data.h @@ -0,0 +1,796 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef _CUDA_DATA_H_ +#define _CUDA_DATA_H_ + + +enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet +//xx==x in atom_vec x is a member therefore copymode x produces compile errors +#include "cuda_shared.h" +#include "cuda_wrapper_cu.h" +#include "cuda_data_cu.h" +#include + +#include +#include +template +class cCudaData +{ + protected: + void** buffer; + int* buf_size; + host_type* host_data; + dev_array* dev_data_array; + dev_type* temp_data; + unsigned nbytes; + bool owns_dev_array; + bool current_data_on_device; //this is not yet working as intended and therefore deactivated + bool current_data_on_host; + bool is_continues; + bool pinned; + + public: + cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false); + cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false); + ~cCudaData(); + void* dev_data() {if(dev_data_array!=NULL) return dev_data_array->dev_data; else return NULL;}; + void set_dev_data(void* adev_data) {dev_data_array->dev_data=adev_data;}; + void set_dev_array(dev_array* adev_array) {dev_data_array=adev_array;}; + void set_host_data(host_type* host_data); + void* get_host_data() { return host_data;}; + void set_buffer(void** buffer,int* buf_size,bool ais_continues); + unsigned int* get_dim() {return dev_data_array->dim;}; + // if you want to upload data to the gpu, which will not change there, then set will_be_changed=false + // if you want to upload data to the gpu and update it there, then set will_be_changed=true (default) + void upload(bool will_be_changed=true); + void uploadAsync(int stream, bool will_be_changed=true ); + // if you want to download data just to have a look at it, then set will_be_changed=false + // if you are going to modify the downloaded data, then set will_be_changed=true (default) + void download(bool will_be_changed=true); + void downloadAsync(int stream); + void memset_device(int value); + void device_data_has_changed() {current_data_on_device=false;} + void host_data_has_changed() {current_data_on_host=false;} + int dev_size() { + int size = dev_data_array->dim[0]*sizeof(dev_type); + if(dev_data_array->dim[1]) size*=dev_data_array->dim[1]; + if(dev_data_array->dim[2]) size*=dev_data_array->dim[2]; + return size;} +}; + + +template +cCudaData +::cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned) +{ + pinned=is_pinned; + owns_dev_array = false; + current_data_on_device = false; + current_data_on_host = false; + is_continues = false; + this->host_data = host_data; + this->dev_data_array = dev_data_array; + unsigned ndev; + if((mode == x)||(mode==xx)) + { + ndev = dim_x; + dev_data_array->dim[0] = dim_x; + dev_data_array->dim[1] = 0; + dev_data_array->dim[2] = 0; + } + else if(mode == xy || mode == yx ) + { + ndev = dim_x * dim_y; + dev_data_array->dim[0] = dim_x; + dev_data_array->dim[1] = dim_y; + dev_data_array->dim[2] = 0; + } + else + { + ndev = dim_x * dim_y * dim_z; + dev_data_array->dim[0] = dim_x; + dev_data_array->dim[1] = dim_y; + dev_data_array->dim[2] = dim_z; + } + nbytes = ndev * sizeof(dev_type); + if(nbytes<=0) + { + host_data=NULL; + temp_data=NULL; + dev_data_array->dev_data=NULL; + return; + } + + dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes); + if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type)) + { + if(not pinned) + temp_data = new dev_type[ndev]; + else + { + temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type)); + } + } +} + +template +cCudaData +::cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned) +{ + pinned=is_pinned; + this->dev_data_array = new dev_array; + this->owns_dev_array = true; + current_data_on_device = false; + current_data_on_host = false; + is_continues = false; + this->host_data = host_data; + unsigned ndev; + if((mode == x)||(mode==xx)) + { + ndev = dim_x; + dev_data_array->dim[0] = dim_x; + dev_data_array->dim[1] = 0; + dev_data_array->dim[2] = 0; + } + else if(mode == xy || mode == yx ) + { + ndev = dim_x * dim_y; + dev_data_array->dim[0] = dim_x; + dev_data_array->dim[1] = dim_y; + dev_data_array->dim[2] = 0; + } + else + { + ndev = dim_x * dim_y * dim_z; + dev_data_array->dim[0] = dim_x; + dev_data_array->dim[1] = dim_y; + dev_data_array->dim[2] = dim_z; + } + nbytes = ndev * sizeof(dev_type); + if(nbytes<=0) + { + host_data=NULL; + temp_data=NULL; + dev_data_array->dev_data=NULL; + return; + } + + dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes); + if(((mode!=x)&&(mode!=xx)) || (typeid(host_type) != typeid(dev_type))) + { + if(not pinned) + temp_data = new dev_type[ndev]; + else + { + temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type)); + } + } +} + +template +cCudaData +::~cCudaData() +{ + if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type)) + { + if(not pinned) + delete [] temp_data; + else + { + CudaWrapper_FreePinnedHostData((void*)temp_data); + } + } + if((dev_data_array->dev_data)&&(nbytes>0)) + CudaWrapper_FreeCudaData(dev_data_array->dev_data,nbytes); + if(owns_dev_array) delete dev_data_array; +} + +template +void cCudaData +::set_host_data(host_type* host_data) +{ + this->host_data = host_data; +} + +template +void cCudaData +::upload(bool will_be_changed) +{ + // if current data is already up, do not re-upload it +// if(current_data_on_device) return; + if(buffer&&is_continues) + { + printf("Actual Buffer: %p %i\n",*buffer,*buf_size); + if(typeid(host_type)==typeid(double)) + { + if(typeid(dev_type)==typeid(double)) + { + CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + else if(typeid(dev_type)==typeid(float)) + { + CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + } + else if(typeid(host_type)==typeid(float)) + { + if(typeid(dev_type)==typeid(double)) + { + CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + else if(typeid(dev_type)==typeid(float)) + { + CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + } + else if(typeid(host_type)==typeid(int)) + { + if(typeid(dev_type)==typeid(int)) + { + CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + } + } + switch(mode) + { + case x: + { + if(typeid(host_type) == typeid(dev_type)) + CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes); + else + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) temp_data[i] = static_cast(host_data[i]); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); + } + break; + } + + case xx: + { + if(typeid(host_type) == typeid(dev_type)) + CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes); + else + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) temp_data[i] = static_cast(host_data[i]); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); + } + break; + } + + case xy: + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + { + dev_type* temp = &temp_data[i * dev_data_array->dim[1]]; + for(unsigned j=0; jdim[1]; ++j) + { + temp[j] = static_cast((reinterpret_cast(host_data))[i][j]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); + break; + } + + case yx: + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned j=0; jdim[1]; ++j) + { + dev_type* temp = &temp_data[j*dev_data_array->dim[0]]; + for(unsigned i=0; idim[0]; ++i) + { + temp[i] = static_cast(reinterpret_cast(host_data)[i][j]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); + break; + } + case xyz: + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + for(unsigned j=0; jdim[1]; ++j) + { + dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]]; + for(unsigned k=0; kdim[2]; ++k) + { + temp[k] = static_cast(reinterpret_cast(host_data)[i][j][k]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); + break; + } + + case xzy: + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + for(unsigned k=0; kdim[2]; ++k) + { + dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]]; + for(unsigned j=0; jdim[1]; ++j) + { + temp[j] = static_cast(reinterpret_cast(host_data)[i][j][k]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); + break; + } + } + // we have uploaded the data to the device, i.e.: + current_data_on_device = true; + // the data is going to change on the device, making the host data out-dated + if(will_be_changed) current_data_on_host = false; +} + +template +void cCudaData +::uploadAsync(int stream,bool will_be_changed) +{ + // if current data is already up, do not re-upload it +// if(current_data_on_device) return; + if(buffer&&is_continues) + { + printf("Actual Buffer: %p %i\n",*buffer,*buf_size); + if(typeid(host_type)==typeid(double)) + { + if(typeid(dev_type)==typeid(double)) + { + CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + else if(typeid(dev_type)==typeid(float)) + { + CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + } + else if(typeid(host_type)==typeid(float)) + { + if(typeid(dev_type)==typeid(double)) + { + CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + else if(typeid(dev_type)==typeid(float)) + { + CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + } + else if(typeid(host_type)==typeid(int)) + { + if(typeid(dev_type)==typeid(int)) + { + CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data, + dev_data_array->dim,mode,*buffer); + current_data_on_device = true; + if(will_be_changed) current_data_on_host = false; + return; + } + } + } + switch(mode) + { + case x: + { + if(typeid(host_type) == typeid(dev_type)) + CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream); + else + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) temp_data[i] = static_cast(host_data[i]); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); + } + break; + } + + case xx: + { + if(typeid(host_type) == typeid(dev_type)) + CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream); + else + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) temp_data[i] = static_cast(host_data[i]); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); + } + break; + } + + case xy: + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + { + dev_type* temp = &temp_data[i * dev_data_array->dim[1]]; + for(unsigned j=0; jdim[1]; ++j) + { + temp[j] = static_cast((reinterpret_cast(host_data))[i][j]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); + break; + } + + case yx: + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned j=0; jdim[1]; ++j) + { + dev_type* temp = &temp_data[j*dev_data_array->dim[0]]; + for(unsigned i=0; idim[0]; ++i) + { + temp[i] = static_cast(reinterpret_cast(host_data)[i][j]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); + break; + } + case xyz: + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + for(unsigned j=0; jdim[1]; ++j) + { + dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]]; + for(unsigned k=0; kdim[2]; ++k) + { + temp[k] = static_cast(reinterpret_cast(host_data)[i][j][k]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); + break; + } + + case xzy: + { + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + for(unsigned k=0; kdim[2]; ++k) + { + dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]]; + for(unsigned j=0; jdim[1]; ++j) + { + temp[j] = static_cast(reinterpret_cast(host_data)[i][j][k]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufUploadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); + break; + } + } + // we have uploaded the data to the device, i.e.: + current_data_on_device = true; + // the data is going to change on the device, making the host data out-dated + if(will_be_changed) current_data_on_host = false; +} + +template +void cCudaData +::download(bool will_be_changed) +{ + // if current data is already down, do not re-download it +// if(current_data_on_host) return; + switch(mode) + { + case x: + { + if(typeid(host_type) == typeid(dev_type)) + CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes); + else + { + CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) host_data[i] = static_cast(temp_data[i]); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufDownloadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + } + break; + } + + case xx: + { + if(typeid(host_type) == typeid(dev_type)) + CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes); + else + { + CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) host_data[i] = static_cast(temp_data[i]); + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufDownloadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + } + break; + } + + case xy: + { + CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + { + dev_type* temp = &temp_data[i * dev_data_array->dim[1]]; + for(unsigned j=0; jdim[1]; ++j) + { + reinterpret_cast(host_data)[i][j] = static_cast(temp[j]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufDownloadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + break; + } + + case yx: + { + CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned j=0; jdim[1]; ++j) + { + dev_type* temp = &temp_data[j*dev_data_array->dim[0]]; + for(unsigned i=0; idim[0]; ++i) + { + reinterpret_cast(host_data)[i][j] = static_cast(temp[i]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufDownloadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + break; + } + + case xyz: + { + CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + for(unsigned j=0; jdim[1]; ++j) + { + dev_type* temp = &temp_data[(i * dev_data_array->dim[1]+j)*dev_data_array->dim[2]]; + for(unsigned k=0; kdim[2]; ++k) + { + reinterpret_cast(host_data)[i][j][k] = static_cast(temp[k]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufDownloadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + break; + } + + case xzy: + { + CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + for(unsigned i=0; idim[0]; ++i) + for(unsigned k=0; kdim[2]; ++k) + { + dev_type* temp = &temp_data[(i * dev_data_array->dim[2]+k)*dev_data_array->dim[1]]; + for(unsigned j=0; jdim[1]; ++j) + { + reinterpret_cast(host_data)[i][j][k] = static_cast(temp[j]); + } + } + clock_gettime(CLOCK_REALTIME,&time2); + CudaWrapper_AddCPUBufDownloadTime( + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); + break; + } + } + // we have downloaded the data to the host, i.e.: + current_data_on_host = true; + // the data is going to change on the host, making the device data out-dated + if(will_be_changed) current_data_on_device = false; +} + +template +void cCudaData +::downloadAsync(int stream) +{ + switch(mode) + { + case x: + { + if(typeid(host_type) == typeid(dev_type)) + { + CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream); + CudaWrapper_SyncStream(stream); + } + else + { + CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream); + CudaWrapper_SyncStream(stream); + for(unsigned i=0; idim[0]; ++i) host_data[i] = static_cast(temp_data[i]); + } + break; + } + + case xx: + { + if(typeid(host_type) == typeid(dev_type)) + { + CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream); + CudaWrapper_SyncStream(stream); + } + else + { + CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream); + CudaWrapper_SyncStream(stream); + for(unsigned i=0; idim[0]; ++i) host_data[i] = static_cast(temp_data[i]); + } + break; + } + + case xy: + { + CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream); + CudaWrapper_SyncStream(stream); + for(unsigned i=0; idim[0]; ++i) + { + dev_type* temp = &temp_data[i * dev_data_array->dim[1]]; + for(unsigned j=0; jdim[1]; ++j) + { + reinterpret_cast(host_data)[i][j] = static_cast(temp[j]); + } + } + break; + } + + case yx: + { + CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream); + CudaWrapper_SyncStream(stream); + for(unsigned j=0; jdim[1]; ++j) + { + dev_type* temp = &temp_data[j*dev_data_array->dim[0]]; + for(unsigned i=0; idim[0]; ++i) + { + reinterpret_cast(host_data)[i][j] = static_cast(temp[i]); + } + } + break; + } + } +} + + +template +void cCudaData +::memset_device(int value) +{ + CudaWrapper_Memset(dev_data_array->dev_data,value, nbytes); +} + +template +void cCudaData +::set_buffer(void** abuffer,int* abuf_size,bool ais_continues) +{ + buffer = abuffer; + buf_size = abuf_size; + unsigned nbytes_buf=(nbytes/sizeof(dev_type))*sizeof(host_type); + if(buffer!=NULL) + if(not((typeid(host_type) == typeid(dev_type))&&(mode == x || mode == xx))) + { + printf("Allocate Buffer: %p %i\n",*buffer,*buf_size); + if(((*buffer)!=NULL)&&(*buf_size +#include +#include +#include +#include "cuda.h" +#include "atom.h" + +using namespace LAMMPS_NS; + +CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp) +{ + cuda = lmp->cuda; + MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");) + this->neigh_list = neigh_list; + neigh_list->cuda_list=this; + sneighlist.maxlocal = neigh_list->get_maxlocal(); + sneighlist.maxneighbors = 32; + sneighlist.maxcut = 0.0; + sneighlist.cutneighsq = NULL; + cu_neighbors = NULL; + cu_neighbors_border = NULL; + cu_neighbors_inner = NULL; + cu_numneigh_border = NULL; + cu_numneigh_inner = NULL; + cu_numneigh = NULL; + cu_ilist = NULL; + cu_ilist_border = NULL; + cu_inum_border = NULL; + inum_border = 0; + neighbors = NULL; + neighbors_inner = NULL; + neighbors_border = NULL; + numneigh_border = NULL; + numneigh_inner = NULL; + ilist_border = NULL; + + build_cuda = false; + sneighlist.binned_id=NULL; + sneighlist.bin_dim=new int[3]; + sneighlist.bin_dim[0]=0; + sneighlist.bin_dim[1]=0; + sneighlist.bin_dim[2]=0; + + cu_ex_type = NULL; + cu_ex1_bit = NULL; + cu_ex2_bit = NULL; + cu_ex_mol_bit = NULL; + sneighlist.nex_type=0; + sneighlist.nex_group=0; + sneighlist.nex_mol=0; + + sneighlist.bin_nmax=0; + sneighlist.bin_extraspace=0.05; + MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... end\n");) + +} + +CudaNeighList::~CudaNeighList() +{ + dev_free(); +} + +void CudaNeighList::dev_alloc() +{ + MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... start\n"); ) + cu_ilist = new cCudaData (neigh_list->ilist , & sneighlist.ilist , sneighlist.maxlocal ); + cu_numneigh = new cCudaData (neigh_list->numneigh, & sneighlist.numneigh , sneighlist.maxlocal ); + neighbors = new int[atom->nmax*sneighlist.maxneighbors]; + cu_neighbors= new cCudaData (neighbors , & sneighlist.neighbors, atom->nmax*sneighlist.maxneighbors ); + + if(cuda->shared_data.overlap_comm) + { + ilist_border = new int[sneighlist.maxlocal]; + numneigh_border = new int[sneighlist.maxlocal]; + numneigh_inner = new int[sneighlist.maxlocal]; + cu_inum_border = new cCudaData (&inum_border , & sneighlist.inum_border , 1 ); + cu_ilist_border = new cCudaData (ilist_border , & sneighlist.ilist_border , sneighlist.maxlocal ); + cu_numneigh_border = new cCudaData (numneigh_border , & sneighlist.numneigh_border , sneighlist.maxlocal ); + cu_numneigh_inner = new cCudaData (numneigh_inner , & sneighlist.numneigh_inner , sneighlist.maxlocal ); + neighbors_border = new int[sneighlist.maxlocal*sneighlist.maxneighbors]; + cu_neighbors_border= new cCudaData (neighbors_border , & sneighlist.neighbors_border, sneighlist.maxlocal*sneighlist.maxneighbors ); + neighbors_inner = new int[sneighlist.maxlocal*sneighlist.maxneighbors]; + cu_neighbors_inner = new cCudaData (neighbors_inner , & sneighlist.neighbors_inner , sneighlist.maxlocal*sneighlist.maxneighbors ); + } + MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... end\n"); ) +} + +void CudaNeighList::dev_free() +{ + MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... start\n"); ) + delete cu_numneigh; + delete cu_ilist; + delete [] neighbors; + delete cu_neighbors; + + if(cuda->shared_data.overlap_comm) + { + delete [] ilist_border; + delete [] numneigh_border; + delete [] numneigh_inner; + delete [] neighbors_border; + delete [] neighbors_inner; + delete cu_inum_border; + delete cu_neighbors_border; + delete cu_neighbors_inner; + delete cu_numneigh_border; + delete cu_numneigh_inner; + delete cu_ilist_border; + } + MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... end\n"); ) +} + +void CudaNeighList::grow_device() +{ + MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... start\n");) + // if host has allocated more memory for atom arrays than device has, then allocate more memory on device + int new_maxlocal = neigh_list->get_maxlocal(); + if(sneighlist.maxlocal < new_maxlocal) + { + sneighlist.maxlocal = new_maxlocal; + dev_free(); + dev_alloc(); + } + + if(!cu_ilist || !cu_numneigh) dev_alloc(); + + // check, if hosts data has been allocated somewhere else + if(cu_ilist ->get_host_data() != neigh_list->ilist) cu_ilist ->set_host_data(neigh_list->ilist); + if(cu_numneigh->get_host_data() != neigh_list->numneigh) cu_numneigh->set_host_data(neigh_list->numneigh); + + MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... end\n");) +} + + +void CudaNeighList::nl_upload(bool will_be_changed) +{ + //return; + MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... start\n");) + if(cu_ilist) + cu_ilist->upload(); + if(cu_numneigh) + cu_numneigh->upload(); + MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... end\n");) +} + +void CudaNeighList::nl_download(bool will_be_changed) +{ + MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... start\n");) + if(cu_ilist) + cu_ilist->download(); + if(cu_numneigh) + cu_numneigh->download(); + MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... end\n");) +} + diff --git a/src/USER-CUDA/cuda_neigh_list.h b/src/USER-CUDA/cuda_neigh_list.h new file mode 100644 index 0000000000..97e75b9390 --- /dev/null +++ b/src/USER-CUDA/cuda_neigh_list.h @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef LMP_NEIGH_LIST_CUDA_H +#define LMP_NEIGH_LIST_CUDA_H + +#include "pointers.h" +#include "cuda_data.h" +#include "neigh_list.h" + +namespace LAMMPS_NS +{ + +class CudaNeighList : protected Pointers +{ + public: + cCudaData* cu_ilist; + cCudaData* cu_numneigh; + cCudaData* cu_inum_border; + cCudaData* cu_ilist_border; + cCudaData* cu_numneigh_border; + cCudaData* cu_numneigh_inner; + cCudaData* cu_neighbors; + cCudaData* cu_neighbors_border; + cCudaData* cu_neighbors_inner; + cCudaData* cu_ex_type; + cCudaData* cu_ex1_bit; + cCudaData* cu_ex2_bit; + cCudaData* cu_ex_mol_bit; + + + cuda_shared_neighlist sneighlist; + + int* neighbors; + int* neighbors_inner; + int* neighbors_border; + int inum_border; + int* ilist_border; + int* numneigh_border; + int* numneigh_inner; + int nex_type; + int nex_group; + int nex_mol; + + bool build_cuda; + + CudaNeighList(class LAMMPS *, class NeighList* neigh_list); + ~CudaNeighList(); + void grow_device(); // will grow pages memory on device, keeping old pages. will grow lists memory on device, deleting old lists + void nl_upload(bool will_be_changed=true); + void nl_download(bool will_be_changed=true); + NeighList* neigh_list; + + void dev_alloc(); + void dev_free(); + + private: + class Cuda *cuda; +}; + +} + +#endif diff --git a/src/USER-CUDA/cuda_precision.h b/src/USER-CUDA/cuda_precision.h new file mode 100644 index 0000000000..5b7d6a6843 --- /dev/null +++ b/src/USER-CUDA/cuda_precision.h @@ -0,0 +1,269 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef CUDA_PRECISION_H_ +#define CUDA_PRECISION_H_ +/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA. + * Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation). + * ***_FLOAT: type definition of given property + * ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F) + */ + +#ifdef CUDA_USE_BINNING +#define CUDA_IF_BINNING(a) a +#else +#define CUDA_IF_BINNING(a) +#endif + +//GLOBAL + +#ifdef CUDA_PRECISION + #if CUDA_PRECISION == 1 + #define CUDA_FLOAT float + #define CUDA_F(x) x##f + #endif + #if CUDA_PRECISION == 2 + #define CUDA_FLOAT double + #define CUDA_F(x) x + #endif +#endif + +#ifndef CUDA_PRECISION + #define CUDA_FLOAT double + #define CUDA_F(x) x + #define CUDA_PRECISION 2 +#endif +//-------------------------------- +//-----------FFT----------------- +//-------------------------------- + +#ifdef FFT_PRECISION_CU + #if FFT_PRECISION_CU == 1 + #define FFT_FLOAT float + #define FFT_F(x) x##f + #endif + #if FFT_PRECISION_CU == 2 + #define FFT_FLOAT double + #define FFT_F(x) x + #endif +#endif + +#ifndef FFT_PRECISION_CU + #define FFT_FLOAT CUDA_FLOAT + #define FFT_F(x) CUDA_F(x) + #define FFT_PRECISION_CU CUDA_PRECISION +#endif + +//-------------------------------- +//-----------PPPM----------------- +//-------------------------------- + +#ifdef PPPM_PRECISION + #if PPPM_PRECISION == 1 + #define PPPM_FLOAT float + #define PPPM_F(x) x##f + #endif + #if PPPM_PRECISION == 2 + #define PPPM_FLOAT double + #define PPPM_F(x) x + #endif +#endif + +#ifndef PPPM_PRECISION + #define PPPM_FLOAT CUDA_FLOAT + #define PPPM_F(x) CUDA_F(x) + #define PPPM_PRECISION CUDA_PRECISION +#endif + +//-------------------------------- +//-----------FORCE----------------- +//-------------------------------- + + +#ifdef F_PRECISION + #if F_PRECISION == 1 + #define F_FLOAT float + #define F_F(x) x##f + #endif + #if F_PRECISION == 2 + #define F_FLOAT double + #define F_F(x) x + #endif +#endif + +#ifndef F_PRECISION + #define F_FLOAT CUDA_FLOAT + #define F_F(x) CUDA_F(x) + #define F_PRECISION CUDA_PRECISION +#endif + +#if F_PRECISION == 1 +#define _SQRT_ sqrtf +#define _RSQRT_ rsqrtf +#define _EXP_ expf +#else +#define _SQRT_ sqrt +#define _RSQRT_ rsqrt +#define _EXP_ exp +#endif + +#if F_PRECISION == 2 +struct F_FLOAT2 +{ + F_FLOAT x; + F_FLOAT y; +}; +struct F_FLOAT3 +{ + F_FLOAT x; + F_FLOAT y; + F_FLOAT z; +}; +struct F_FLOAT4 +{ + F_FLOAT x; + F_FLOAT y; + F_FLOAT z; + F_FLOAT w; +}; +#else +#define F_FLOAT2 float2 +#define F_FLOAT3 float3 +#define F_FLOAT4 float4 +#endif +//-------------------------------- +//-----------ENERGY----------------- +//-------------------------------- + +#ifndef ENERGY_PRECISION + #define ENERGY_FLOAT CUDA_FLOAT + #define ENERGY_F(x) CUDA_F(x) +#endif + +#ifdef ENERGY_PRECISION + #if ENERGY_PRECISION == 1 + #define ENERGY_FLOAT float + #define ENERGY_F(x) x##f + #endif + #if ENERGY_PRECISION == 2 + #define ENERGY_FLOAT double + #define ENERGY_F(x) x + #endif +#endif + +#ifndef ENERGY_PRECISION + #define ENERGY_FLOAT CUDA_FLOAT + #define ENERGY_F(x) CUDA_F(x) + #define ENERGY_PRECISION CUDA_PRECISION +#endif + +//-------------------------------- +//-----------POSITIONS------------ +//-------------------------------- + +#ifdef X_PRECISION + #if X_PRECISION == 1 + #define X_FLOAT float + #define X_F(x) x##f + #endif + #if X_PRECISION == 2 + #define X_FLOAT double + #define X_F(x) x + #endif +#endif + +#ifndef X_PRECISION + #define X_FLOAT CUDA_FLOAT + #define X_F(x) CUDA_F(x) + #define X_PRECISION CUDA_PRECISION +#endif + +#if X_PRECISION == 2 +struct X_FLOAT2 +{ + X_FLOAT x; + X_FLOAT y; +}; +struct X_FLOAT3 +{ + X_FLOAT x; + X_FLOAT y; + X_FLOAT z; +}; +struct X_FLOAT4 +{ + X_FLOAT x; + X_FLOAT y; + X_FLOAT z; + X_FLOAT w; +}; +#else +#define X_FLOAT2 float2 +#define X_FLOAT3 float3 +#define X_FLOAT4 float4 +#endif + +//-------------------------------- +//-----------velocities----------- +//-------------------------------- + +#ifdef V_PRECISION + #if V_PRECISION == 1 + #define V_FLOAT float + #define V_F(x) x##f + #endif + #if V_PRECISION == 2 + #define V_FLOAT double + #define V_F(x) x + #endif +#endif + +#ifndef V_PRECISION + #define V_FLOAT CUDA_FLOAT + #define V_F(x) CUDA_F(x) + #define V_PRECISION CUDA_PRECISION +#endif + +#if V_PRECISION == 2 +struct V_FLOAT4 +{ + V_FLOAT x; + V_FLOAT y; + V_FLOAT z; + V_FLOAT w; +}; +#else +#define V_FLOAT4 float4 +#endif + +#ifdef NO_PREC_TIMING +struct timespec_2 +{ + unsigned int tv_sec; + unsigned int tv_nsec; +}; + +#define timespec timespec_2 +#define clock_gettime(a,b) +#endif +#endif /*CUDA_PRECISION_H_*/ diff --git a/src/USER-CUDA/cuda_shared.h b/src/USER-CUDA/cuda_shared.h new file mode 100644 index 0000000000..f7983fff05 --- /dev/null +++ b/src/USER-CUDA/cuda_shared.h @@ -0,0 +1,378 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef _CUDA_SHARED_H_ +#define _CUDA_SHARED_H_ +#include "cuda_precision.h" + +#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int) + +struct dev_array +{ + void* dev_data; // pointer to memory address on cuda device + unsigned dim[3]; // array dimensions +}; + +struct cuda_shared_atom // relevent data from atom class +{ + dev_array dx; // cumulated distance for binning settings + dev_array x; // position + dev_array v; // velocity + dev_array f; // force + dev_array tag; + dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1) + dev_array mask; + dev_array image; + dev_array q; // charges + dev_array mass; // per-type masses + dev_array rmass; // per-atom masses + dev_array radius; // per-atom radius + dev_array density; + dev_array omega; + dev_array torque; + dev_array molecule; + + dev_array special; + int maxspecial; + dev_array nspecial; + int* special_flag; + int molecular; + + dev_array eatom; // per-atom energy + dev_array vatom; // per-atom virial + int need_eatom; + int need_vatom; + + dev_array x_type; // position + type in X_FLOAT4 struct + dev_array v_radius; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style + dev_array omega_rmass; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style + + double* mass_host; // remember per-type host pointer to masses + //int natoms; // total # of atoms in system, could be 0 + int nghost; // and ghost atoms on this proc + int nlocal; // # of owned + int nall; // total # of atoms in this proc + int nmax; // max # of owned+ghost in arrays on this proc + int ntypes; + int q_flag; // do we have charges? + int rmass_flag; // do we have per-atom masses? + int firstgroup; + int nfirst; + + int update_nlocal; + int update_nmax; + + dev_array xhold; // position at last neighboring + X_FLOAT triggerneighsq; // maximum square movement before reneighboring + int reneigh_flag; // is reneighboring necessary + int maxhold; // size of xhold + int dist_check; //perform distance check for reneighboring + dev_array binned_id; //id of each binned atom (not tag!!) + dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]] + float bin_extraspace; + int bin_dim[3]; + int bin_nmax; + dev_array map_array; +}; + +struct cuda_shared_pair // relevent data from pair class +{ + char cudable_force; // check for (cudable_force!=0) + X_FLOAT cut_global; + X_FLOAT cut_inner_global; + X_FLOAT cut_coul_global; + double** cut; // type-type cutoff + double** cutsq; // type-type cutoff + double** cut_inner; // type-type cutoff for coul + double** cut_coul; // type-type cutoff for coul + double** coeff1; // tpye-type pair parameters + double** coeff2; + double** coeff3; + double** coeff4; + double** coeff5; + double** coeff6; + double** coeff7; + double** coeff8; + double** coeff9; + double** coeff10; + double** offset; + double* special_lj; + double* special_coul; + dev_array virial; // ENERGY_FLOAT + dev_array eng_vdwl; // ENERGY_FLOAT + dev_array eng_coul; // ENERGY_FLOAT + X_FLOAT cut_coulsq_global; + F_FLOAT g_ewald,kappa; + int freeze_group_bit; + + dev_array coeff1_gm; + dev_array coeff2_gm; + dev_array coeff3_gm; + dev_array coeff4_gm; + dev_array coeff5_gm; + dev_array coeff6_gm; + dev_array coeff7_gm; + dev_array coeff8_gm; + dev_array coeff9_gm; + dev_array coeff10_gm; + + int lastgridsize; + int n_energy_virial; + int collect_forces_later; + int use_block_per_atom; + int override_block_per_atom; + +}; + +struct cuda_shared_domain // relevent data from domain class +{ + X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc + X_FLOAT subhi[3]; + X_FLOAT boxlo[3]; + X_FLOAT boxhi[3]; + X_FLOAT prd[3]; + int periodicity[3]; // xyz periodicity as array + + int triclinic; + X_FLOAT xy; + X_FLOAT xz; + X_FLOAT yz; + X_FLOAT boxlo_lamda[3]; + X_FLOAT boxhi_lamda[3]; + X_FLOAT prd_lamda[3]; + X_FLOAT h[6]; + X_FLOAT h_inv[6]; + V_FLOAT h_rate[6]; + int update; +}; + +struct cuda_shared_pppm +{ + char cudable_force; +#ifdef FFT_CUFFT + FFT_FLOAT* work1; + FFT_FLOAT* work2; + FFT_FLOAT* work3; + PPPM_FLOAT* greensfn; + PPPM_FLOAT* fkx; + PPPM_FLOAT* fky; + PPPM_FLOAT* fkz; + PPPM_FLOAT* vg; +#endif + int* part2grid; + PPPM_FLOAT* density_brick; + int* density_brick_int; + PPPM_FLOAT density_intScale; + PPPM_FLOAT* vdx_brick; + PPPM_FLOAT* vdy_brick; + PPPM_FLOAT* vdz_brick; + PPPM_FLOAT* density_fft; + ENERGY_FLOAT* energy; + ENERGY_FLOAT* virial; + int nxlo_in; + int nxhi_in; + int nxlo_out; + int nxhi_out; + int nylo_in; + int nyhi_in; + int nylo_out; + int nyhi_out; + int nzlo_in; + int nzhi_in; + int nzlo_out; + int nzhi_out; + int nx_pppm; + int ny_pppm; + int nz_pppm; + PPPM_FLOAT qqrd2e; + int order; + // float3 sublo; + PPPM_FLOAT* rho_coeff; + int nmax; + int nlocal; + PPPM_FLOAT* debugdata; + PPPM_FLOAT delxinv; + PPPM_FLOAT delyinv; + PPPM_FLOAT delzinv; + int nlower; + int nupper; + PPPM_FLOAT shiftone; + +}; + +struct cuda_shared_comm +{ + int maxswap; + int maxlistlength; + dev_array pbc; + dev_array slablo; + dev_array slabhi; + dev_array multilo; + dev_array multihi; + dev_array sendlist; + int grow_flag; + int comm_phase; + + int nsend; + int* nsend_swap; + int* send_size; + int* recv_size; + double** buf_send; + void** buf_send_dev; + double** buf_recv; + void** buf_recv_dev; + void* buffer; + int buffer_size; + double overlap_split_ratio; +}; + +struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data +{ + int maxlocal; + int inum; // # of I atoms neighbors are stored for local indices of I atoms + int inum_border2; + dev_array inum_border; // # of atoms which interact with border atoms + dev_array ilist; + dev_array ilist_border; + dev_array numneigh; + dev_array numneigh_inner; + dev_array numneigh_border; + dev_array firstneigh; + dev_array neighbors; + dev_array neighbors_border; + dev_array neighbors_inner; + int maxpage; + dev_array page_pointers; + dev_array* pages; + int maxneighbors; + int neigh_lists_per_page; + double** cutneighsq; + CUDA_FLOAT* cu_cutneighsq; + int* binned_id; + int* bin_dim; + int bin_nmax; + float bin_extraspace; + double maxcut; + dev_array ex_type; + int nex_type; + dev_array ex1_bit; + dev_array ex2_bit; + int nex_group; + dev_array ex_mol_bit; + int nex_mol; + +}; + +struct cuda_compile_settings // this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files +{ + int prec_glob; + int prec_x; + int prec_v; + int prec_f; + int prec_pppm; + int prec_fft; + int cufft; + int arch; +}; + +struct cuda_timings_struct +{ + //Debug: + double test1; + double test2; + //transfers + double transfer_upload_tmp_constr; + double transfer_download_tmp_deconstr; + + //communication + double comm_forward_total; + double comm_forward_mpi_upper; + double comm_forward_mpi_lower; + double comm_forward_kernel_pack; + double comm_forward_kernel_unpack; + double comm_forward_kernel_self; + double comm_forward_upload; + double comm_forward_download; + + double comm_exchange_total; + double comm_exchange_mpi; + double comm_exchange_kernel_pack; + double comm_exchange_kernel_unpack; + double comm_exchange_kernel_fill; + double comm_exchange_cpu_pack; + double comm_exchange_upload; + double comm_exchange_download; + + double comm_border_total; + double comm_border_mpi; + double comm_border_kernel_pack; + double comm_border_kernel_unpack; + double comm_border_kernel_self; + double comm_border_kernel_buildlist; + double comm_border_upload; + double comm_border_download; + + //pair forces + double pair_xtype_conversion; + double pair_kernel; + double pair_virial; + double pair_force_collection; + + //neighbor + double neigh_bin; + double neigh_build; + double neigh_special; + + //PPPM + double pppm_particle_map; + double pppm_make_rho; + double pppm_brick2fft; + double pppm_poisson; + double pppm_fillbrick; + double pppm_fieldforce; + double pppm_compute; + +}; + +struct cuda_shared_data // holds space for all relevent data from the different classes +{ + void* buffer; //holds temporary GPU data [data used in subroutines, which has not to be consistend outside of that routine] + int buffersize; //maxsize of buffer + int buffer_new; //should be 1 if the pointer to buffer has changed + void* flag; + void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array + cuda_shared_atom atom; + cuda_shared_pair pair; + cuda_shared_domain domain; + cuda_shared_pppm pppm; + cuda_shared_comm comm; + cuda_compile_settings compile_settings; + cuda_timings_struct cuda_timings; + int exchange_dim; + int me; //mpi rank + unsigned int datamask; + int overlap_comm; +}; + + +#endif // #ifndef _CUDA_SHARED_H_ diff --git a/src/USER-CUDA/domain_cuda.cpp b/src/USER-CUDA/domain_cuda.cpp new file mode 100644 index 0000000000..fc8d8bb498 --- /dev/null +++ b/src/USER-CUDA/domain_cuda.cpp @@ -0,0 +1,270 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author (triclinic) : Pieter in 't Veld (SNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "stdlib.h" +#include "string.h" +#include "stdio.h" +#include "math.h" +#include "domain_cuda.h" +#include "style_region.h" +#include "atom.h" +#include "force.h" +#include "update.h" +#include "modify.h" +#include "fix.h" +#include "fix_deform.h" +#include "region.h" +#include "lattice.h" +#include "comm.h" +#include "memory.h" +#include "error.h" + +#include "cuda.h" +#include "domain_cu.h" + +using namespace LAMMPS_NS; + +#define BIG 1.0e20 +#define SMALL 1.0e-4 +#define DELTA 1 +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +enum{NO_REMAP,X_REMAP,V_REMAP}; // same as fix_deform.cpp + +/* ---------------------------------------------------------------------- + default is periodic +------------------------------------------------------------------------- */ + +DomainCuda::DomainCuda(LAMMPS *lmp) : Domain(lmp) +{ + cuda = lmp->cuda; +} + +/* ---------------------------------------------------------------------- */ + +void DomainCuda::init() +{ + Domain::init(); + + if(not cuda->finished_run) + { + cuda->setDomainParams(); + Cuda_Domain_Init(&cuda->shared_data); + } +} + +/* ---------------------------------------------------------------------- + set global box params + assumes boxlo/hi and triclinic tilts are already set +------------------------------------------------------------------------- */ + +void DomainCuda::set_global_box() +{ + Domain::set_global_box(); + + if(not cuda->finished_run) + { + cuda->setDomainParams(); + } +} + +/* ---------------------------------------------------------------------- + set lamda box params, only need be done one time + assumes global box is defined and proc assignment has been made by comm + for uppermost proc, insure subhi = 1.0 (in case round-off occurs) +------------------------------------------------------------------------- */ + +void DomainCuda::set_lamda_box() +{ + Domain::set_lamda_box(); + + if(not cuda->finished_run) + { + cuda->setDomainParams(); + } +} + +/* ---------------------------------------------------------------------- + set local subbox params + assumes global box is defined and proc assignment has been made + for uppermost proc, insure subhi = boxhi (in case round-off occurs) +------------------------------------------------------------------------- */ + +void DomainCuda::set_local_box() +{ + Domain::set_local_box(); + + if(not cuda->finished_run) + { + // cuda->setDomainParams(); + //Cuda_Domain_Init(&cuda->shared_data); + } +} + +/* ---------------------------------------------------------------------- + reset global & local boxes due to global box boundary changes + if shrink-wrapped, determine atom extent and reset boxlo/hi + if shrink-wrapped and triclinic, perform shrink-wrap in box coords +------------------------------------------------------------------------- */ + +void DomainCuda::reset_box() +{ + if (nonperiodic == 2) { + + // convert back to box coords for shrink-wrap operation + + if (triclinic) lamda2x(atom->nlocal); + + // compute extent of atoms on this proc + + double extent[3][2],all[3][2]; + + extent[2][0] = extent[1][0] = extent[0][0] = BIG; + extent[2][1] = extent[1][1] = extent[0][1] = -BIG; + + double **x = atom->x; + int nlocal = atom->nlocal; + + if (cuda->finished_setup&&(!cuda->oncpu)) + { + extent[0][0]=cuda->extent[0]; + extent[0][1]=cuda->extent[1]; + extent[1][0]=cuda->extent[2]; + extent[1][1]=cuda->extent[3]; + extent[2][0]=cuda->extent[4]; + extent[2][1]=cuda->extent[5]; + } + else + for (int i = 0; i < nlocal; i++) { + extent[0][0] = MIN(extent[0][0],x[i][0]); + extent[0][1] = MAX(extent[0][1],x[i][0]); + extent[1][0] = MIN(extent[1][0],x[i][1]); + extent[1][1] = MAX(extent[1][1],x[i][1]); + extent[2][0] = MIN(extent[2][0],x[i][2]); + extent[2][1] = MAX(extent[2][1],x[i][2]); + } + + // compute extent across all procs + // flip sign of MIN to do it in one Allreduce MAX + + extent[0][0] = -extent[0][0]; + extent[1][0] = -extent[1][0]; + extent[2][0] = -extent[2][0]; + + MPI_Allreduce(extent,all,6,MPI_DOUBLE,MPI_MAX,world); + + // in shrink-wrapped dims, set box by atom extent + // if minimum set, enforce min box size settings + + if (xperiodic == 0) { + if (boundary[0][0] == 2) boxlo[0] = -all[0][0] - SMALL; + else if (boundary[0][0] == 3) boxlo[0] = MIN(-all[0][0]-SMALL,minxlo); + if (boundary[0][1] == 2) boxhi[0] = all[0][1] + SMALL; + else if (boundary[0][1] == 3) boxhi[0] = MAX(all[0][1]+SMALL,minxhi); + if (boxlo[0] > boxhi[0]) error->all("Illegal simulation box"); + } + if (yperiodic == 0) { + if (boundary[1][0] == 2) boxlo[1] = -all[1][0] - SMALL; + else if (boundary[1][0] == 3) boxlo[1] = MIN(-all[1][0]-SMALL,minylo); + if (boundary[1][1] == 2) boxhi[1] = all[1][1] + SMALL; + else if (boundary[1][1] == 3) boxhi[1] = MAX(all[1][1]+SMALL,minyhi); + if (boxlo[1] > boxhi[1]) error->all("Illegal simulation box"); + } + if (zperiodic == 0) { + if (boundary[2][0] == 2) boxlo[2] = -all[2][0] - SMALL; + else if (boundary[2][0] == 3) boxlo[2] = MIN(-all[2][0]-SMALL,minzlo); + if (boundary[2][1] == 2) boxhi[2] = all[2][1] + SMALL; + else if (boundary[2][1] == 3) boxhi[2] = MAX(all[2][1]+SMALL,minzhi); + if (boxlo[2] > boxhi[2]) error->all("Illegal simulation box"); + } + } + + set_global_box(); + set_local_box(); + + if(not cuda->finished_run) + { + cuda->setDomainParams(); + Cuda_Domain_Init(&cuda->shared_data); + } + + // if shrink-wrapped, convert to lamda coords for new box + // must re-invoke pbc() b/c x2lamda result can be outside 0,1 due to roundoff + + if (nonperiodic == 2 && triclinic) { + x2lamda(atom->nlocal); + pbc(); + } +} + +/* ---------------------------------------------------------------------- + enforce PBC and modify box image flags for each atom + called every reneighboring and by other commands that change atoms + resulting coord must satisfy lo <= coord < hi + MAX is important since coord - prd < lo can happen when coord = hi + if fix deform, remap velocity of fix group atoms by box edge velocities + for triclinic, atoms must be in lamda coords (0-1) before pbc is called + image = 10 bits for each dimension + increment/decrement in wrap-around fashion +------------------------------------------------------------------------- */ + +void DomainCuda::pbc() +{ + if(cuda->finished_setup&&(!cuda->oncpu)) + { + cuda->setDomainParams(); + Cuda_Domain_PBC(&cuda->shared_data, deform_vremap, deform_groupbit,cuda->extent); + return; + } + + Domain::pbc(); +} + + +/* ---------------------------------------------------------------------- + convert triclinic 0-1 lamda coords to box coords for all N atoms + x = H lamda + x0; +------------------------------------------------------------------------- */ + +void DomainCuda::lamda2x(int n) +{ + if(cuda->finished_setup&&(!cuda->oncpu)) + { + Cuda_Domain_lamda2x(&cuda->shared_data,n); + return; + } + + Domain::lamda2x(n); +} + +/* ---------------------------------------------------------------------- + convert box coords to triclinic 0-1 lamda coords for all N atoms + lamda = H^-1 (x - x0) +------------------------------------------------------------------------- */ + +void DomainCuda::x2lamda(int n) +{ + if(cuda->finished_setup&&(!cuda->oncpu)) + { + Cuda_Domain_x2lamda(&cuda->shared_data,n); + return; + } + + Domain::x2lamda(n); +} diff --git a/src/USER-CUDA/domain_cuda.h b/src/USER-CUDA/domain_cuda.h new file mode 100644 index 0000000000..1ff4f75946 --- /dev/null +++ b/src/USER-CUDA/domain_cuda.h @@ -0,0 +1,41 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_DOMAIN_CUDA_H +#define LMP_DOMAIN_CUDA_H + +#include "pointers.h" +#include "domain.h" + +namespace LAMMPS_NS { + +class DomainCuda : public Domain { + public: + DomainCuda(class LAMMPS *); + void init(); + void set_global_box(); + void set_lamda_box(); + void set_local_box(); + void reset_box(); + void pbc(); + + void lamda2x(int); + void x2lamda(int); + + protected: + class Cuda *cuda; +}; + +} + +#endif diff --git a/src/USER-CUDA/modify_cuda.cpp b/src/USER-CUDA/modify_cuda.cpp new file mode 100644 index 0000000000..7f8d7f8c5a --- /dev/null +++ b/src/USER-CUDA/modify_cuda.cpp @@ -0,0 +1,442 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include "modify_cuda.h" +#include "style_compute.h" +#include "style_fix.h" +#include "atom.h" +#include "comm.h" +#include "fix.h" +#include "compute.h" +#include "group.h" +#include "update.h" +#include "domain.h" +#include "cuda.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define DELTA 4 + +// mask settings - same as in fix.cpp + +#define INITIAL_INTEGRATE 1 +#define POST_INTEGRATE 2 +#define PRE_EXCHANGE 4 +#define PRE_NEIGHBOR 8 +#define PRE_FORCE 16 +#define POST_FORCE 32 +#define FINAL_INTEGRATE 64 +#define END_OF_STEP 128 +#define THERMO_ENERGY 256 +#define INITIAL_INTEGRATE_RESPA 512 +#define POST_INTEGRATE_RESPA 1024 +#define PRE_FORCE_RESPA 2048 +#define POST_FORCE_RESPA 4096 +#define FINAL_INTEGRATE_RESPA 8192 +#define MIN_PRE_EXCHANGE 16384 +#define MIN_POST_FORCE 32768 +#define MIN_ENERGY 65536 + +#include "cuda_modify_flags.h" + +#define MIN(A,B) ((A) < (B)) ? (A) : (B) +#define MAX(A,B) ((A) > (B)) ? (A) : (B) + +#define BIG 1.0e20 + +/* ---------------------------------------------------------------------- */ + +ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp) +{ + cuda = lmp->cuda; + + n_initial_integrate_cuda = 0; + n_post_integrate_cuda = 0; + n_pre_exchange = 0; + n_pre_neighbor_cuda = 0; + n_pre_force_cuda = 0; + n_post_force_cuda = 0; + n_final_integrate_cuda = 0; + n_end_of_step_cuda = 0; + n_thermo_energy_cuda = 0; + + n_initial_integrate_host = 0; + n_post_integrate_host = 0; + n_pre_exchange = 0; + n_pre_neighbor_host = 0; + n_pre_force_host = 0; + n_post_force_host = 0; + n_final_integrate_host = 0; + n_end_of_step_host = 0; + n_thermo_energy_host = 0; + + list_initial_integrate_cuda = NULL; + list_post_integrate_cuda = NULL; + list_pre_exchange_cuda = NULL; + list_pre_neighbor_cuda = NULL; + list_pre_force_cuda = NULL; + list_post_force_cuda = NULL; + list_final_integrate_cuda = NULL; + list_end_of_step_cuda = NULL; + list_thermo_energy_cuda = NULL; + end_of_step_every_cuda = NULL; +} + +/* ---------------------------------------------------------------------- */ + +ModifyCuda::~ModifyCuda() +{ + delete [] list_initial_integrate_cuda; + delete [] list_post_integrate_cuda; + delete [] list_pre_exchange_cuda; + delete [] list_pre_neighbor_cuda; + delete [] list_pre_force_cuda; + delete [] list_post_force_cuda; + delete [] list_final_integrate_cuda; + delete [] list_end_of_step_cuda; + delete [] list_thermo_energy_cuda; + delete [] end_of_step_every_cuda; +} + +/* ---------------------------------------------------------------------- + initialize all fixes and computes +------------------------------------------------------------------------- */ + +void ModifyCuda::init() +{ + int i,j; + + // delete storage of restart info since it is not valid after 1st run + + restart_deallocate(); + + // create lists of fixes to call at each stage of run + + list_init(INITIAL_INTEGRATE,n_initial_integrate,list_initial_integrate); + list_init(POST_INTEGRATE,n_post_integrate,list_post_integrate); + list_init(PRE_EXCHANGE,n_pre_exchange,list_pre_exchange); + list_init(PRE_NEIGHBOR,n_pre_neighbor,list_pre_neighbor); + list_init(PRE_FORCE,n_pre_force,list_pre_force); + list_init(POST_FORCE,n_post_force,list_post_force); + list_init(FINAL_INTEGRATE,n_final_integrate,list_final_integrate); + list_init_end_of_step(END_OF_STEP,n_end_of_step,list_end_of_step); + list_init_thermo_energy(THERMO_ENERGY,n_thermo_energy,list_thermo_energy); + + list_init(INITIAL_INTEGRATE_CUDA, n_initial_integrate_cuda, list_initial_integrate_cuda); + list_init(POST_INTEGRATE_CUDA, n_post_integrate_cuda, list_post_integrate_cuda); + list_init(PRE_EXCHANGE_CUDA, n_pre_exchange_cuda, list_pre_exchange_cuda); + list_init(PRE_NEIGHBOR_CUDA, n_pre_neighbor_cuda, list_pre_neighbor_cuda); + list_init(PRE_FORCE_CUDA, n_pre_force_cuda, list_pre_force_cuda); + list_init(POST_FORCE_CUDA, n_post_force_cuda, list_post_force_cuda); + list_init(FINAL_INTEGRATE_CUDA, n_final_integrate_cuda, list_final_integrate_cuda); + list_init_end_of_step_cuda(END_OF_STEP_CUDA, n_end_of_step_cuda, list_end_of_step_cuda); + list_init_thermo_energy(THERMO_ENERGY_CUDA, n_thermo_energy_cuda, list_thermo_energy_cuda); + + n_initial_integrate_host = n_initial_integrate; + n_post_integrate_host = n_post_integrate; + n_pre_exchange_host = n_pre_exchange; + n_pre_neighbor_host = n_pre_neighbor; + n_pre_force_host = n_pre_force; + n_post_force_host = n_post_force; + n_final_integrate_host = n_final_integrate; + n_end_of_step_host = n_end_of_step; + n_thermo_energy_host = n_thermo_energy; + + n_initial_integrate = n_initial_integrate_cuda+n_initial_integrate_host; + n_post_integrate = n_post_integrate_cuda+n_post_integrate_host; + n_pre_exchange = n_pre_exchange_cuda+n_pre_exchange_host; + n_pre_neighbor = n_pre_neighbor_cuda+n_pre_neighbor_host; + n_pre_force = n_pre_force_cuda+n_pre_force_host; + n_post_force = n_post_force_cuda+n_post_force_host; + n_final_integrate = n_final_integrate_cuda+n_final_integrate_host; + n_end_of_step = n_end_of_step_cuda+n_end_of_step_host; + n_thermo_energy = n_thermo_energy_cuda+n_thermo_energy_host; + + list_init(INITIAL_INTEGRATE_RESPA, + n_initial_integrate_respa,list_initial_integrate_respa); + list_init(POST_INTEGRATE_RESPA, + n_post_integrate_respa,list_post_integrate_respa); + list_init(POST_FORCE_RESPA, + n_post_force_respa,list_post_force_respa); + list_init(PRE_FORCE_RESPA, + n_pre_force_respa,list_pre_force_respa); + list_init(FINAL_INTEGRATE_RESPA, + n_final_integrate_respa,list_final_integrate_respa); + + list_init(MIN_PRE_EXCHANGE,n_min_pre_exchange,list_min_pre_exchange); + list_init(MIN_POST_FORCE,n_min_post_force,list_min_post_force); + list_init(MIN_ENERGY,n_min_energy,list_min_energy); + + // init each fix + // needs to come before compute init + // this is b/c some computes call fix->dof() + // FixRigid::dof() depends on its own init having been called + + comm->maxforward_fix = comm->maxreverse_fix = 0; + for (i = 0; i < nfix; i++) fix[i]->init(); + + // set global flag if any fix has its restart_pbc flag set + + restart_pbc_any = 0; + for (i = 0; i < nfix; i++) + if (fix[i]->restart_pbc) restart_pbc_any = 1; + + // create list of computes that store invocation times + + list_init_compute(); + + // init each compute + // set invoked_scalar,vector,etc to -1 to force new run to re-compute them + // add initial timestep to all computes that store invocation times + // since any of them may be invoked by initial thermo + // do not clear out invocation times stored within a compute, + // b/c some may be holdovers from previous run, like for ave fixes + + for (i = 0; i < ncompute; i++) { + compute[i]->init(); + compute[i]->invoked_scalar = -1; + compute[i]->invoked_vector = -1; + compute[i]->invoked_array = -1; + compute[i]->invoked_peratom = -1; + compute[i]->invoked_local = -1; + } + addstep_compute_all(update->ntimestep); + + // warn if any particle is time integrated more than once + + int nlocal = atom->nlocal; + int *mask = atom->mask; + + int *flag = new int[nlocal]; + for (i = 0; i < nlocal; i++) flag[i] = 0; + + int groupbit; + for (i = 0; i < nfix; i++) { + if (fix[i]->time_integrate == 0) continue; + groupbit = fix[i]->groupbit; + for (j = 0; j < nlocal; j++) + if (mask[j] & groupbit) flag[j]++; + } + + int check = 0; + for (i = 0; i < nlocal; i++) + if (flag[i] > 1) check = 1; + + delete [] flag; + + int checkall; + MPI_Allreduce(&check,&checkall,1,MPI_INT,MPI_SUM,world); + if (comm->me == 0 && checkall) + error->warning("One or more atoms are time integrated more than once"); +} + +/* ---------------------------------------------------------------------- + 1st half of integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyCuda::initial_integrate(int vflag) +{ + for(int i = 0; i < n_initial_integrate_cuda; i++) + fix[list_initial_integrate_cuda[i]]->initial_integrate(vflag); + + if(n_initial_integrate_host != 0) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_initial_integrate_host; i++) + fix[list_initial_integrate[i]]->initial_integrate(vflag); + cuda->uploadAll(); cuda->oncpu = false; + } +} + +/* ---------------------------------------------------------------------- + post_integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyCuda::post_integrate() +{ + for(int i = 0; i < n_post_integrate_cuda; i++) + fix[list_post_integrate_cuda[i]]->post_integrate(); + + if(n_post_integrate_host != 0) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_post_integrate_host; i++) + fix[list_post_integrate[i]]->post_integrate(); + cuda->uploadAll(); cuda->oncpu = false; + } +} + +/* ---------------------------------------------------------------------- + pre_exchange call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyCuda::pre_exchange() +{ + for(int i = 0; i < n_pre_exchange_cuda; i++) + fix[list_pre_exchange_cuda[i]]->pre_exchange(); + + if(n_pre_exchange_host != 0) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_pre_exchange_host; i++) + fix[list_pre_exchange[i]]->pre_exchange(); + cuda->uploadAll(); cuda->oncpu = false; + } +} + +/* ---------------------------------------------------------------------- + pre_neighbor call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyCuda::pre_neighbor() +{ + for(int i = 0; i < n_pre_neighbor_cuda; i++) + fix[list_pre_neighbor_cuda[i]]->pre_neighbor(); + + if(n_pre_neighbor_host != 0) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_pre_neighbor_host; i++) + fix[list_pre_neighbor[i]]->pre_neighbor(); + cuda->uploadAll(); cuda->oncpu = false; + } +} + +/* ---------------------------------------------------------------------- + pre_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyCuda::pre_force(int vflag) +{ + for(int i = 0; i < n_pre_force_cuda; i++) + fix[list_pre_force_cuda[i]]->pre_force(vflag); + + if(n_pre_force_host != 0) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_pre_force_host; i++) + fix[list_pre_force[i]]->pre_force(vflag); + cuda->uploadAll(); cuda->oncpu = false; + } +} + +/* ---------------------------------------------------------------------- + post_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyCuda::post_force(int vflag) +{ + for(int i = 0; i < n_post_force_cuda; i++) + fix[list_post_force_cuda[i]]->post_force(vflag); + + if(n_post_force_host != 0) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_post_force_host; i++) + fix[list_post_force[i]]->post_force(vflag); + cuda->uploadAll(); cuda->oncpu = false; + } +} + +/* ---------------------------------------------------------------------- + 2nd half of integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyCuda::final_integrate() +{ + for (int i = 0; i < n_final_integrate_cuda; i++) + fix[list_final_integrate_cuda[i]]->final_integrate(); + + if(n_final_integrate_host != 0) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_final_integrate_host; i++) + fix[list_final_integrate[i]]->final_integrate(); + cuda->uploadAll(); cuda->oncpu = false; + } +} + +/* ---------------------------------------------------------------------- + end-of-timestep call, only for relevant fixes + only call fix->end_of_step() on timesteps that are multiples of nevery +------------------------------------------------------------------------- */ + +void ModifyCuda::end_of_step() +{ + for (int i = 0; i < n_end_of_step_cuda; i++) + if (update->ntimestep % end_of_step_every_cuda[i] == 0) + fix[list_end_of_step_cuda[i]]->end_of_step(); + + if(n_end_of_step_host != 0) + { + int do_thisstep=0; + for (int i = 0; i < n_end_of_step_host; i++) + if (update->ntimestep % end_of_step_every[i] == 0) do_thisstep=1; + if(do_thisstep) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_end_of_step_host; i++) + if (update->ntimestep % end_of_step_every[i] == 0) + fix[list_end_of_step[i]]->end_of_step(); + cuda->uploadAll(); cuda->oncpu = false; + } + } +} + +/* ---------------------------------------------------------------------- + thermo energy call, only for relevant fixes + called by Thermo class + compute_scalar() is fix call to return energy +------------------------------------------------------------------------- */ + +double ModifyCuda::thermo_energy() +{ + double energy = 0.0; + + for (int i = 0; i < n_thermo_energy_cuda; i++) + energy += fix[list_thermo_energy_cuda[i]]->compute_scalar(); + + if(n_thermo_energy_host != 0) + { + cuda->downloadAll(); cuda->oncpu = true; + for (int i = 0; i < n_thermo_energy_host; i++) + energy += fix[list_thermo_energy[i]]->compute_scalar(); + cuda->uploadAll(); cuda->oncpu = false; + } + + return energy; +} + + + +void ModifyCuda::list_init_end_of_step_cuda(int mask, int &n, int *&list) +{ + delete [] list; + delete [] end_of_step_every_cuda; + + n = 0; + for (int i = 0; i < nfix; i++) if (fmask[i] & mask) n++; + list = new int[n]; + end_of_step_every_cuda = new int[n]; + + n = 0; + for (int i = 0; i < nfix; i++) + if (fmask[i] & mask) { + list[n] = i; + end_of_step_every_cuda[n++] = fix[i]->nevery; + } +} diff --git a/src/USER-CUDA/modify_cuda.h b/src/USER-CUDA/modify_cuda.h new file mode 100644 index 0000000000..7d0d670f9c --- /dev/null +++ b/src/USER-CUDA/modify_cuda.h @@ -0,0 +1,82 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_MODIFY_CUDA_H +#define LMP_MODIFY_CUDA_H + +#include +#include "modify.h" + +namespace LAMMPS_NS { + +class ModifyCuda : public Modify { + public: + + int n_initial_integrate_cuda; + int n_post_integrate_cuda; + int n_pre_exchange_cuda; + int n_pre_neighbor_cuda; + int n_pre_force_cuda; + int n_post_force_cuda; + int n_final_integrate_cuda; + int n_end_of_step_cuda; + int n_thermo_energy_cuda; + + int n_initial_integrate_host; + int n_post_integrate_host; + int n_pre_exchange_host; + int n_pre_neighbor_host; + int n_pre_force_host; + int n_post_force_host; + int n_final_integrate_host; + int n_end_of_step_host; + int n_thermo_energy_host; + + ModifyCuda(class LAMMPS *); + ~ModifyCuda(); + void init(); + void initial_integrate(int); + void post_integrate(); + //void pre_decide(); + void pre_exchange(); + void pre_neighbor(); + void pre_force(int); + void post_force(int); + void final_integrate(); + void end_of_step(); + double thermo_energy(); + + + protected: + class Cuda *cuda; + + // lists of fixes to apply at different stages of timestep + + // list of cuda fixes + int *list_initial_integrate_cuda; + int *list_post_integrate_cuda; + int *list_pre_exchange_cuda; + int *list_pre_neighbor_cuda; + int *list_pre_force_cuda; + int *list_post_force_cuda; + int *list_final_integrate_cuda; + int *list_end_of_step_cuda; + int *list_thermo_energy_cuda; + int *end_of_step_every_cuda; + + void list_init_end_of_step_cuda(int, int &, int *&); +}; + +} + +#endif diff --git a/src/USER-CUDA/neigh_full_cuda.cpp b/src/USER-CUDA/neigh_full_cuda.cpp new file mode 100644 index 0000000000..49678c1d06 --- /dev/null +++ b/src/USER-CUDA/neigh_full_cuda.cpp @@ -0,0 +1,317 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef CUDA +#include "neighbor_cuda.h" +#include "neigh_list.h" +#include "atom.h" +#include "domain.h" +#include "group.h" +#include "error.h" +#include "cuda_neigh_list.h" +#include "cuda.h" +#include "neighbor_cu.h" +#include +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- + N^2 search for all neighbors + every neighbor pair appears in list of both atoms i and j +------------------------------------------------------------------------- */ +void NeighborCuda::full_bin_cuda(NeighList *list) +{ + MYDBG(printf(" # CUDA::NeighFullBinCuda ... start\n");) + if(includegroup) error->warning("Warning using inlcudegroup neighborbuild. This is not yet supported by CUDA neighborbuild styles.\n"); + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + if(nlocal==0) return; + CudaNeighList* clist=list->cuda_list; + cuda_shared_neighlist* slist=&clist->sneighlist; + + if(not clist) cuda->registerNeighborList(list); + + clist->build_cuda=true; + + if(slist->bin_extraspace<0.09) + { + for(int i=1;i<=atom->ntypes;i++) + for(int j=1;j<=atom->ntypes;j++) + { + if(slist->maxcutmaxcut=cutneighsq[i][j]; + } + slist->maxcut=sqrt(slist->maxcut); + } + int bin_dim_tmp[3]; + int bin_nmax_tmp; +//printf("Hallo\n"); + timespec starttime,endtime; + do + { + do + { + bin_dim_tmp[0]=static_cast ((domain->subhi[0]-domain->sublo[0])/slist->maxcut); + bin_dim_tmp[1]=static_cast ((domain->subhi[1]-domain->sublo[1])/slist->maxcut); + bin_dim_tmp[2]=static_cast ((domain->subhi[2]-domain->sublo[2])/slist->maxcut); + if(bin_dim_tmp[0]==0) bin_dim_tmp[0]+=1; + if(bin_dim_tmp[1]==0) bin_dim_tmp[1]+=1; + if(bin_dim_tmp[2]==0) bin_dim_tmp[2]+=1; + bin_nmax_tmp=static_cast ((1.0+slist->bin_extraspace)*nlocal/(bin_dim_tmp[0]*bin_dim_tmp[1]*bin_dim_tmp[2])); + bin_dim_tmp[0]+=4; + bin_dim_tmp[1]+=4; + bin_dim_tmp[2]+=4; + if(bin_nmax_tmp<32) slist->maxcut*=1.2; + // printf("slist->maxcut: %lf\n", slist->maxcut); + } while(bin_nmax_tmp<32); + if((slist->bin_dim[0]!=bin_dim_tmp[0])||(slist->bin_dim[1]!=bin_dim_tmp[1])||(slist->bin_dim[2]!=bin_dim_tmp[2])||(slist->bin_nmax!=bin_nmax_tmp)) + { + if(slist->binned_id!=NULL) + CudaWrapper_FreeCudaData(slist->binned_id,slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int)); + slist->bin_dim[0] = bin_dim_tmp[0]; + slist->bin_dim[1] = bin_dim_tmp[1]; + slist->bin_dim[2] = bin_dim_tmp[2]; + slist->bin_nmax = bin_nmax_tmp; + slist->binned_id=(int*) CudaWrapper_AllocCudaData(slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int)); + //printf("slist->bin: %i %i %i %i \n", bin_dim_tmp[0],bin_dim_tmp[1],bin_dim_tmp[2],bin_nmax_tmp); + } + //if(list->cuda_list->sneighlist.bin_nmax>512) error->all("To many atoms per bin. Likely cause is very long pair cutoff. This needs major rewrite of code and is not yet scheduled to be done.\n"); + }while(Cuda_BinAtoms(&cuda->shared_data, &list->cuda_list->sneighlist)); + + // cuda->cu_debugdata->memset_device(0); + int maxneighbors=slist->maxneighbors; + int *ilist = list->ilist; + int *numneigh = list->numneigh; + + if((nex_type!=slist->nex_type)|| + (nex_group!=slist->nex_group)|| + (nex_mol!=slist->nex_mol)) + { + slist->nex_type=nex_type; + slist->nex_group=nex_group; + slist->nex_mol=nex_mol; + //printf("%i %i %i\n",nex_type,nex_group,nex_mol); + if(nex_type) + { + delete clist->cu_ex_type; + clist->cu_ex_type=new cCudaData (&ex_type[0][0] , & slist->ex_type , (atom->ntypes+1)*(atom->ntypes+1) ); + clist->cu_ex_type->upload(); + } + //printf("AA %i %i %i\n",nex_type,nex_group,nex_mol); + if(nex_group) + { + delete clist->cu_ex1_bit; + clist->cu_ex1_bit=new cCudaData (ex1_bit , & slist->ex1_bit , nex_group ); + clist->cu_ex1_bit->upload(); + //printf("A %i %i %i\n",nex_type,nex_group,nex_mol); + delete clist->cu_ex2_bit; + clist->cu_ex2_bit=new cCudaData (ex2_bit , & slist->ex2_bit , nex_group ); + clist->cu_ex2_bit->upload(); + } + //printf("B %i %i %i\n",nex_type,nex_group,nex_mol); + if(nex_mol) + { + delete clist->cu_ex_mol_bit; + clist->cu_ex_mol_bit=new cCudaData (ex_mol_bit , & slist->ex_mol_bit , nex_mol ); + clist->cu_ex_mol_bit->upload(); + } + //printf("C %i %i %i\n",nex_type,nex_group,nex_mol); + } + int overflow = 0; + int inum = 0; + int npnt = 0; + do + { + npnt=0; + inum=0; + overflow=0; + clist->grow_device(); + slist->cutneighsq=cutneighsq; + slist->maxneighbors=maxneighbors; + slist->inum = list->inum = nlocal; + //list->cuda_list->grow_device(); + if(cuda->shared_data.overlap_comm) + { + list->cuda_list->inum_border=0; + list->cuda_list->cu_inum_border->upload(); + } + + cuda->shared_data.atom.nall=nall; + //Cuda_NeighborReBuildFirstneigh(&cuda->shared_data, &list->cuda_list->sneighlist); + overflow= Cuda_NeighborBuildFullBin(&cuda->shared_data, &list->cuda_list->sneighlist); + + /*cuda->cu_debugdata->download(); + printf("Debugdata: %i ",cuda->debugdata[0]); + for(int i=0;idebugdata[0];i+=3) printf("// %i %i %i",cuda->debugdata[i+1],cuda->debugdata[i+2],cuda->debugdata[i+3]); + printf("\n");*/ + //printf("maxneighborsA: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax); + + if(overflow<0) + { + maxneighbors+=32; + if(-overflow>maxneighbors) maxneighbors=((-overflow+37)/32)*32; + delete list->cuda_list->cu_neighbors; + delete [] list->cuda_list->neighbors; + list->cuda_list->neighbors= new int[slist->maxlocal*maxneighbors]; + list->cuda_list->sneighlist.maxneighbors=maxneighbors; + //printf("maxneighborsA1: %i %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax,slist->maxlocal); + list->cuda_list->cu_neighbors= new cCudaData (list->cuda_list->neighbors , & list->cuda_list->sneighlist.neighbors, slist->maxlocal*maxneighbors ); + //printf("maxneighborsA2: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax); + + if(cuda->shared_data.overlap_comm) + { + list->cuda_list->sneighlist.maxneighbors=maxneighbors; + list->cuda_list->dev_free(); + list->cuda_list->dev_alloc(); + } + //printf("maxneighborsA3: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax); + } + //printf("maxneighborsB: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax); + if(cuda->shared_data.overlap_comm) + { + list->cuda_list->cu_inum_border->download(); + list->cuda_list->sneighlist.inum_border2=list->cuda_list->inum_border; + } + } + while(overflow<0); + + //cuda->cu_debugdata->download(); + // printf("Differences in: %i\n",cuda->debugdata[0]); + // for(int i=0;i<20;i++) printf("%i %i %i %i// ",cuda->debugdata[4*i+1],cuda->debugdata[4*i+2],cuda->debugdata[4*i+3],cuda->debugdata[4*i+4]); +// printf("\n"); +/*for(int i=0;i<10;i++) +{ + printf("%i %i // ",i,numneigh[i]); + for(int j=0;jcuda_list->neighbors[i+j*nlocal]); + printf("\n"); +}*/ +/* int count=0; + if(cuda->shared_data.overlap_comm) + { + list->cuda_list->cu_inum_border->download(); + list->cuda_list->cu_ilist_border->download(); + list->cuda_list->cu_numneigh_border->download(); + list->cuda_list->cu_numneigh_inner->download(); + list->cuda_list->cu_neighbors->download(); + list->cuda_list->cu_neighbors_inner->download(); + list->cuda_list->cu_neighbors_border->download(); + + //list->cuda_list->cu_firstneigh->download(); + // list->cuda_list->nl_download(); + list->cuda_list->cu_numneigh->download(); + int diff=0; + //for(int i=0;icuda_list->inum_border); + //for(int j=0;jnumneigh[i];j++) printf("%i ",list->firstneigh[i][j]);printf("\n"); + for(int j=0;jcuda_list->inum_border;j++) + if(list->cuda_list->ilist_border[j]==i) k=j; + int d=numneigh[i]-list->cuda_list->numneigh_inner[i]; + if(k>-1) d-=list->cuda_list->numneigh_border[k]; + if(d!=0) {printf("Error at %i %i %i %i %i\n",i,k,d,numneigh[i],list->cuda_list->numneigh_inner[i]); diff++;} + if(k>-1 && count<10) + { + printf("Numneighs: %i %i %i Border_i: %i %i\n",numneigh[i],list->cuda_list->numneigh_inner[i],list->cuda_list->numneigh_border[k],k,(int)list->cuda_list->cu_ilist_border->dev_data()); + cuda->shared_data.me=k; + for(int j=0;jcuda_list->neighbors[i+j*nlocal]); + printf("\n"); + for(int j=0;jcuda_list->numneigh_inner[i];j++) + printf("%i ",list->cuda_list->neighbors_inner[i+j*nlocal]); + printf(" // "); + for(int j=0;jcuda_list->numneigh_border[k];j++) + printf("%i ",list->cuda_list->neighbors_border[k+j*nlocal]); + printf("\n"); + count++; + } + } + printf("%i\n",diff); + }*/ + list->cuda_list->cu_numneigh->download(); + list->cuda_list->cu_ilist->download(); + //printf("Done\n"); + + MYDBG(printf(" # CUDA::NeighFullBinCuda ... end\n");) + +} + + +void NeighborCuda::full_nsq_cuda(NeighList *list) +{ + printf("Full_Nsq cuda neighbor list build is not implemented anymore.\n"); +return; +/* + MYDBG(printf(" # CUDA::NeighFullNSQCuda ... start\n");) + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + if(cuda->cu_xhold) cuda->cu_xhold->upload(); + + + if(not list->cuda_list) cuda->registerNeighborList(list); + list->cuda_list->build_cuda=true; + int maxneighbors=list->cuda_list->sneighlist.maxneighbors; + int neigh_lists_per_page=pgsize/maxneighbors; + int *ilist = list->ilist; + int *numneigh = list->numneigh; + int **firstneigh = list->firstneigh; + int **pages = list->pages; + + int overflow = 0; + int inum = 0; + int npage = 0; + int npnt = 0; + do + { + npage=0; + npnt=0; + inum=0; + overflow=0; + neigh_lists_per_page=pgsize/maxneighbors; + npage=(2*nlocal*maxneighbors-1)/pgsize; + while(npage>list->maxpage) list->add_pages(); + pages = list->pages; + npage=0; + list->cuda_list->sneighlist.neigh_lists_per_page=pgsize/maxneighbors; + list->cuda_list->grow_device(); + list->cuda_list->sneighlist.cutneighsq=cutneighsq; + list->cuda_list->sneighlist.maxneighbors=maxneighbors; + list->cuda_list->sneighlist.inum = list->inum = nlocal; + + cuda->shared_data.atom.nall=nall; + Cuda_NeighborReBuildFirstneigh(&cuda->shared_data, &list->cuda_list->sneighlist); + overflow= not Cuda_NeighborBuildFullNsq(&cuda->shared_data, &list->cuda_list->sneighlist); + + + + if(overflow) maxneighbors+=32; + } + while(overflow); + if(not cudable) list->cuda_list->nl_download(); + MYDBG(printf(" # CUDA::NeighFullNSQCuda ... end\n");) + */ +} +#endif + diff --git a/src/USER-CUDA/neighbor_cuda.cpp b/src/USER-CUDA/neighbor_cuda.cpp new file mode 100644 index 0000000000..4575ce2acc --- /dev/null +++ b/src/USER-CUDA/neighbor_cuda.cpp @@ -0,0 +1,221 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "math.h" +#include "neighbor_cuda.h" +#include "cuda.h" +#include "atom.h" +#include "atom_vec.h" +#include "domain.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "force.h" +#include "group.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +enum{NSQ,BIN,MULTI}; // also in neigh_list.cpp + +/* ---------------------------------------------------------------------- */ + +NeighborCuda::NeighborCuda(LAMMPS *lmp) : Neighbor(lmp) +{ + cuda = lmp->cuda; +} + +/* ---------------------------------------------------------------------- */ + +void NeighborCuda::init() +{ + cuda->set_neighinit(dist_check,0.25*skin*skin); + cudable = 1; + + Neighbor::init(); +} + +/* ---------------------------------------------------------------------- + overwrite either full_nsq or full_bin with CUDA-equivalent methods + any other neighbor build method is unchanged +------------------------------------------------------------------------- */ + +void NeighborCuda::choose_build(int index, NeighRequest *rq) +{ + Neighbor::choose_build(index,rq); + + if (rq->full && style == NSQ && rq->ghost == 0 && rq->cudable) + pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda; + else if (rq->full && style == BIN && rq->ghost == 0 && rq->cudable) + pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda; +} + +/* ---------------------------------------------------------------------- */ + +int NeighborCuda::check_distance() +{ + double delx,dely,delz,rsq; + double delta,deltasq,delta1,delta2; + + if (boxcheck) { + if (triclinic == 0) { + delx = bboxlo[0] - boxlo_hold[0]; + dely = bboxlo[1] - boxlo_hold[1]; + delz = bboxlo[2] - boxlo_hold[2]; + delta1 = sqrt(delx*delx + dely*dely + delz*delz); + delx = bboxhi[0] - boxhi_hold[0]; + dely = bboxhi[1] - boxhi_hold[1]; + delz = bboxhi[2] - boxhi_hold[2]; + delta2 = sqrt(delx*delx + dely*dely + delz*delz); + delta = 0.5 * (skin - (delta1+delta2)); + deltasq = delta*delta; + } else { + domain->box_corners(); + delta1 = delta2 = 0.0; + for (int i = 0; i < 8; i++) { + delx = corners[i][0] - corners_hold[i][0]; + dely = corners[i][1] - corners_hold[i][1]; + delz = corners[i][2] - corners_hold[i][2]; + delta = sqrt(delx*delx + dely*dely + delz*delz); + if (delta > delta1) delta1 = delta; + else if (delta > delta2) delta2 = delta; + } + delta = 0.5 * (skin - (delta1+delta2)); + deltasq = delta*delta; + } + } else deltasq = triggersq; + + double **x = atom->x; + int nlocal = atom->nlocal; + if (includegroup) nlocal = atom->nfirst; + + int flag = 0; + + if (not cuda->neighbor_decide_by_integrator) { + cuda->cu_x_download(); + for (int i = 0; i < nlocal; i++) { + delx = x[i][0] - xhold[i][0]; + dely = x[i][1] - xhold[i][1]; + delz = x[i][2] - xhold[i][2]; + rsq = delx*delx + dely*dely + delz*delz; + if (rsq > deltasq) flag = 1; + } + } + else flag = cuda->shared_data.atom.reneigh_flag; + + int flagall; + MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_MAX,world); + if (flagall && ago == MAX(every,delay)) ndanger++; + return flagall; +} + +/* ---------------------------------------------------------------------- */ + +void NeighborCuda::build() +{ + int i; + + ago = 0; + ncalls++; + + // store current atom positions and box size if needed + + if (dist_check) { + if (cuda->decide_by_integrator()) + cuda->update_xhold(maxhold, &xhold[0][0]); + else { + if (cuda->finished_setup) cuda->cu_x_download(); + + double **x = atom->x; + int nlocal = atom->nlocal; + if (includegroup) nlocal = atom->nfirst; + if (nlocal > maxhold) { + maxhold = atom->nmax; + memory->destroy(xhold); + memory->create(xhold,maxhold,3,"neigh:xhold"); + } + for (i = 0; i < nlocal; i++) { + xhold[i][0] = x[i][0]; + xhold[i][1] = x[i][1]; + xhold[i][2] = x[i][2]; + } + if (boxcheck) { + if (triclinic == 0) { + boxlo_hold[0] = bboxlo[0]; + boxlo_hold[1] = bboxlo[1]; + boxlo_hold[2] = bboxlo[2]; + boxhi_hold[0] = bboxhi[0]; + boxhi_hold[1] = bboxhi[1]; + boxhi_hold[2] = bboxhi[2]; + } else { + domain->box_corners(); + corners = domain->corners; + for (i = 0; i < 8; i++) { + corners_hold[i][0] = corners[i][0]; + corners_hold[i][1] = corners[i][1]; + corners_hold[i][2] = corners[i][2]; + } + } + } + } + } + + if (not cudable && cuda->finished_setup && atom->avec->cudable) + cuda->downloadAll(); + if (cudable && (not cuda->finished_setup)) { + cuda->checkResize(); + cuda->uploadAll(); + } + + // if any lists store neighbors of ghosts: + // invoke grow() if nlocal+nghost exceeds previous list size + // else only invoke grow() if nlocal exceeds previous list size + // only done for lists with growflag set and which are perpetual + + if (anyghostlist && atom->nlocal+atom->nghost > maxatom) { + maxatom = atom->nmax; + for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom); + } else if (atom->nlocal > maxatom) { + maxatom = atom->nmax; + for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom); + } + + // extend atom bin list if necessary + + if (style != NSQ && atom->nmax > maxbin) { + maxbin = atom->nmax; + memory->destroy(bins); + memory->create(bins,maxbin,"bins"); + } + + // check that neighbor list with special bond flags will not overflow + + if (atom->nlocal+atom->nghost > NEIGHMASK) + error->one("Too many local+ghost atoms for neighbor list"); + + // invoke building of pair and molecular neighbor lists + // only for pairwise lists with buildflag set + + for (i = 0; i < nblist; i++) + (this->*pair_build[blist[i]])(lists[blist[i]]); + + if (atom->molecular) { + if (force->bond) (this->*bond_build)(); + if (force->angle) (this->*angle_build)(); + if (force->dihedral) (this->*dihedral_build)(); + if (force->improper) (this->*improper_build)(); + } +} diff --git a/src/USER-CUDA/neighbor_cuda.h b/src/USER-CUDA/neighbor_cuda.h new file mode 100644 index 0000000000..c171b23be3 --- /dev/null +++ b/src/USER-CUDA/neighbor_cuda.h @@ -0,0 +1,39 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_NEIGHBOR_CUDA_H +#define LMP_NEIGHBOR_CUDA_H + +#include "neighbor.h" + +namespace LAMMPS_NS { + +class NeighborCuda : public Neighbor { + public: + NeighborCuda(class LAMMPS *); + void init(); + int check_distance(); + void build(); + + private: + class Cuda *cuda; + + void choose_build(int, class NeighRequest *); + typedef void (NeighborCuda::*PairPtr)(class NeighList *); + void full_nsq_cuda(class NeighList *); + void full_bin_cuda(class NeighList *); +}; + +} + +#endif diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp new file mode 100644 index 0000000000..259ae8815d --- /dev/null +++ b/src/USER-CUDA/verlet_cuda.cpp @@ -0,0 +1,1103 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + +#include +#include +#include +#include "verlet_cuda.h" +#include "neighbor.h" +#include "domain.h" +#include "comm.h" +#include "atom.h" +#include "atom_vec.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "dihedral.h" +#include "improper.h" +#include "kspace.h" +#include "output.h" +#include "update.h" +#include "modify_cuda.h" +#include "compute.h" +#include "fix.h" +#include "timer.h" +#include "memory.h" +#include "error.h" +#include "cuda_wrapper_cu.h" +#include "thermo.h" +#include "cuda_pair_cu.h" +#include "cuda.h" +#include +#include + +using namespace LAMMPS_NS; + +#define MAX(a, b) ((a)>(b) ? (a) : (b)) +#define MAKETIMEING + + +VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) { + cuda = lmp->cuda; + + modify_cuda=(ModifyCuda*) modify; +} + +/* ---------------------------------------------------------------------- + setup before run +------------------------------------------------------------------------- */ + +void VerletCuda::setup() +{ + //debug related variables + cuda->debugdata[0]=0; + cuda->cu_debugdata->upload(); + dotestatom=cuda->dotestatom; + int testatom=cuda->testatom;//48267; + + + MYDBG(printf("# CUDA VerletCuda::setup start\n"); ) + + cuda->oncpu = true; + cuda->begin_setup = true; + cuda->finished_run = false; + strcpy(update->integrate_style,"verlet"); + + time_pair=0; + time_kspace=0; + time_comm=0; + time_modify=0; + time_fulliterate=0; + + atom->setup(); + + cuda_shared_atom* cu_atom = & cuda->shared_data.atom; + cuda_shared_domain* cu_domain = & cuda->shared_data.domain; + cuda_shared_pair* cu_pair = & cuda->shared_data.pair; + cu_atom->update_nlocal=1; + cu_atom->update_nmax=1; + + if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true; + + cuda->setDomainParams(); + + + if(cuda->shared_data.me==0) + printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax); + if(cuda->shared_data.me==0) + printf("# CUDA: Using precision: Global: %u X: %u V: %u F: %u PPPM: %u \n", CUDA_PRECISION==1?4:8,sizeof(X_FLOAT),sizeof(V_FLOAT),sizeof(F_FLOAT),sizeof(PPPM_FLOAT)); + cuda->allocate(); + + + if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n"); + + // setup domain, communication and neighboring + // acquire ghosts + // build neighbor lists + + if (triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + comm->exchange(); + if (atom->sortfreq > 0) atom->sort(); + comm->borders(); + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + cuda->setSystemParams(); + cuda->checkResize(); + + if(cuda->shared_data.me==0) + printf("# CUDA: VerletCuda::setup: Upload data...\n"); + cuda->uploadAll(); + neighbor->build(); + neighbor->ncalls = 0; + cuda->uploadAllNeighborLists(); + if(atom->mass) + cuda->cu_mass->upload(); + + if(cuda->cu_map_array) + cuda->cu_map_array->upload(); + + // compute all forces + + ev_set(update->ntimestep); + if(elist_atom) cuda->shared_data.atom.need_eatom = 1; + if(vlist_atom) cuda->shared_data.atom.need_vatom = 1; + if(elist_atom||vlist_atom) cuda->checkResize(); + + + int test_BpA_vs_TpA = true; + timespec starttime; + timespec endtime; + #ifdef NO_PREC_TIMING + double startsec,endsec; + #endif + //if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = false; + if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair &&(cuda->shared_data.pair.override_block_per_atom<0)) + { + int StyleLoops=10; + if(cuda->shared_data.me==0) + printf("Test TpA\n"); + cuda->shared_data.pair.use_block_per_atom = 0; + neighbor->build(); + Cuda_Pair_GenerateXType(&cuda->shared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + force->pair->compute(eflag,vflag); + CudaWrapper_Sync(); + #ifdef NO_PREC_TIMING + startsec = 1.0*clock()/CLOCKS_PER_SEC; + #endif + clock_gettime(CLOCK_REALTIME,&starttime); + for(int i=0;ishared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + force->pair->compute(eflag,vflag); + CudaWrapper_Sync(); + } + clock_gettime(CLOCK_REALTIME,&endtime); + + double TpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + #ifdef NO_PREC_TIMING + endsec = 1.0*clock()/CLOCKS_PER_SEC; + TpAtime = endsec - startsec; + #endif + if(cuda->shared_data.me==0) + printf("Test BpA\n"); + cuda->shared_data.pair.use_block_per_atom = 1; + neighbor->build(); + Cuda_Pair_GenerateXType(&cuda->shared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + force->pair->compute(eflag,vflag); + CudaWrapper_Sync(); + + clock_gettime(CLOCK_REALTIME,&starttime); + #ifdef NO_PREC_TIMING + startsec = 1.0*clock()/CLOCKS_PER_SEC; + #endif + for(int i=0;ishared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + force->pair->compute(eflag,vflag); + CudaWrapper_Sync(); + } + clock_gettime(CLOCK_REALTIME,&endtime); + double BpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + #ifdef NO_PREC_TIMING + endsec = 1.0*clock()/CLOCKS_PER_SEC; + BpAtime = endsec - startsec; + #endif + + if(cuda->shared_data.me==0) + printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n",StyleLoops); + if(cuda->shared_data.me==0) + printf("# CUDA: BpA TpA\n %lf %lf\n",BpAtime,TpAtime); + if(BpAtime>TpAtime) cuda->shared_data.pair.use_block_per_atom = 0; + } + else + cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom; + //cuda->shared_data.pair.use_block_per_atom = 0; + if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true; + neighbor->build(); + neighbor->ncalls = 0; + + force_clear(); + + cuda->cu_f->download(); + if(cuda->cu_torque) + cuda->cu_torque->download(); + + //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); + + MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); ) + + test_atom(testatom,"pre pair force"); + + if(cuda->shared_data.pair.cudable_force) + { + cuda->uploadAll(); + Cuda_Pair_GenerateXType(&cuda->shared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + } + + if (force->pair) force->pair->compute(eflag,vflag); + + if(cuda->shared_data.pair.cudable_force) + { + if(cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } + cuda->downloadAll(); + } + + test_atom(testatom,"post pair force"); + + MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); ) + //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); + + if (atom->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + } + + + if(cuda->shared_data.pppm.cudable_force) + { + cuda->cu_tag ->upload(); + cuda->cu_type->upload(); + cuda->cu_x ->upload(); + cuda->cu_v ->upload(); + cuda->cu_f ->upload(); + if(cu_atom->q_flag) cuda->cu_q->upload(); + } + if (force->kspace) { + force->kspace->setup(); + force->kspace->compute(eflag,vflag); + } + if(cuda->shared_data.pppm.cudable_force) + { + cuda->cu_f ->download(); + } + + test_atom(testatom,"post kspace"); + + cuda->uploadAll(); + if (force->newton) comm->reverse_comm(); + cuda->downloadAll(); + + test_atom(testatom,"post reverse comm"); + + if(cuda->shared_data.me==0) + printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024); + + MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); ) + modify->setup(vflag); + + MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup done\n"); ) + output->setup(1); + + test_atom(testatom,"post setup"); + + MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); ) + cuda->finished_setup = true; + cuda->oncpu = false; +} + + +//this routine is in a messy state +void VerletCuda::setup_minimal(int flag) +{ + + + dotestatom=0; + int testatom=104; + cuda->oncpu = true; + cuda->begin_setup = true; + cuda->finished_run = false; + MYDBG(printf("# CUDA VerletCuda::setup start\n"); ) + strcpy(update->integrate_style,"verlet"); + time_pair=0; + time_kspace=0; + time_comm=0; + time_modify=0; + time_fulliterate=0; + + //cuda->allocate(); + + cuda_shared_atom* cu_atom = & cuda->shared_data.atom; + cuda_shared_domain* cu_domain = & cuda->shared_data.domain; + cuda_shared_pair* cu_pair = & cuda->shared_data.pair; + cu_atom->update_nlocal=1; + cu_atom->update_nmax=1; + + if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true; + + cuda->setDomainParams(); + + + + if(cuda->shared_data.me==0) + printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax); + cuda->allocate(); + + + + + // setup domain, communication and neighboring + // acquire ghosts + // build neighbor lists + + if (flag) { + if (triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + comm->exchange(); + comm->borders(); + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + cuda->setSystemParams(); + cuda->checkResize(); + neighbor->build(); + neighbor->ncalls = 0; + } + + if(cuda->shared_data.me==0) + printf("# CUDA: VerletCuda::setup: Upload data...\n"); + cuda->uploadAll(); + cuda->uploadAllNeighborLists(); + if(atom->mass) + cuda->cu_mass->upload(); + + if(cuda->cu_map_array) + cuda->cu_map_array->upload(); + + // compute all forces + + ev_set(update->ntimestep); + if(elist_atom) cuda->shared_data.atom.need_eatom = 1; + if(vlist_atom) cuda->shared_data.atom.need_vatom = 1; + if(elist_atom||vlist_atom) cuda->checkResize(); + + force_clear(); + cuda->cu_f->download(); + + //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); + + cuda->cu_mass->upload(); + MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); ) + + test_atom(testatom,"pre pair force"); + + if(cuda->shared_data.pair.cudable_force) + { + cuda->uploadAll(); + Cuda_Pair_GenerateXType(&cuda->shared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + } + + if (force->pair) force->pair->compute(eflag,vflag); + + if(cuda->shared_data.pair.cudable_force) + { + if(cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } + cuda->downloadAll(); + } + + test_atom(testatom,"post pair force"); + + MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); ) + //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); + + if (atom->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + } + + + if(cuda->shared_data.pppm.cudable_force) + { + cuda->cu_tag ->upload(); + cuda->cu_type->upload(); + cuda->cu_x ->upload(); + cuda->cu_v ->upload(); + cuda->cu_f ->upload(); + if(cu_atom->q_flag) cuda->cu_q->upload(); + } + if (force->kspace) { + force->kspace->setup(); + force->kspace->compute(eflag,vflag); + } + if(cuda->shared_data.pppm.cudable_force) + { + cuda->cu_f ->download(); + } + + test_atom(testatom,"post kspace"); + + cuda->uploadAll(); + if (force->newton) comm->reverse_comm(); + cuda->downloadAll(); + + test_atom(testatom,"post reverse comm"); + + if(cuda->shared_data.me==0) + printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024); + + MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); ) + modify->setup(vflag); + + MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); ) + cuda->finished_setup=true; + cuda->oncpu=false; +} + +//#define TESTATOM +/* ---------------------------------------------------------------------- + iterate for n steps +------------------------------------------------------------------------- */ + +void VerletCuda::run(int n) +{ + dotestatom=cuda->dotestatom; + int testatom=cuda->testatom;//48267; + + + timespec starttime; + timespec endtime; + timespec starttotal; + timespec endtotal; + + cuda->setTimingsZero(); + + static double testtime=0.0; +// clock_gettime(CLOCK_REALTIME,&starttime); +// clock_gettime(CLOCK_REALTIME,&endtime); +// testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; +// printf("Time: %lf\n",testtime);*/ + + + cuda_shared_domain* cu_domain = & cuda->shared_data.domain; + + int nflag,ntimestep,sortflag; + + int n_initial_integrate = modify_cuda->n_initial_integrate; + int n_post_integrate = modify_cuda->n_post_integrate; + int n_final_integrate = modify_cuda->n_final_integrate; + int n_pre_exchange = modify_cuda->n_pre_exchange; + int n_pre_neighbor = modify_cuda->n_pre_neighbor; + int n_pre_force = modify_cuda->n_pre_force; + int n_post_force = modify_cuda->n_post_force; + int n_end_of_step = modify_cuda->n_end_of_step; + MYDBG(printf("# CUDA: Fixes: i_int: %i p_int: %i f_int: %i pr_exc: %i pr_neigh: %i pr_f: %i p_f: %i eos: %i\n", + n_initial_integrate,n_post_integrate,n_final_integrate,n_pre_exchange,n_pre_neighbor,n_pre_force,n_post_force,n_end_of_step);) + + if (atom->sortfreq > 0) sortflag = 1; + else sortflag = 0; + + + if(cuda->shared_data.me==0) + { + if((not cuda->shared_data.pair.cudable_force)&&(force->pair)) + error->warning("# CUDA: You asked for a Verlet integration using Cuda, " + "but selected a pair force which has not yet been ported to Cuda"); + if((not cuda->shared_data.pppm.cudable_force)&&(force->kspace)) + error->warning("# CUDA: You asked for a Verlet integration using Cuda, " + "but selected a kspace force which has not yet been ported to Cuda"); + if(modify_cuda->n_post_integrate_host+modify_cuda->n_pre_exchange_host+modify_cuda->n_pre_neighbor_host+modify_cuda->n_pre_force_host+modify_cuda->n_post_force_host+modify_cuda->n_end_of_step_host+modify_cuda->n_initial_integrate_host+modify_cuda->n_final_integrate_host) + error->warning("# CUDA: You asked for a Verlet integration using Cuda, " + "but several fixes have not yet been ported to Cuda.\n" + "This can cause a severe speed penalty due to frequent data synchronization between host and GPU."); + if(atom->firstgroupname) + error->warning("Warning: firstgroupname is used, this will cause additional data transfers."); + } + cuda->uploadAll(); + + if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold) + { + const int n=cuda->shared_data.atom.maxhold; + CudaWrapper_CopyData(cuda->cu_xhold->dev_data(),cuda->cu_x->dev_data(),n*sizeof(X_FLOAT)); + CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[atom->nmax],n*sizeof(X_FLOAT)); + CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[2*n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[2*atom->nmax],n*sizeof(X_FLOAT)); + } + + cuda->shared_data.atom.reneigh_flag=0; + cuda->shared_data.atom.update_nlocal=1; + cuda->shared_data.atom.update_nmax=1; + cuda->shared_data.domain.update=1; + cuda->shared_data.buffer_new=1; + cuda->uploadtime=0; + cuda->downloadtime=0; + int firstreneigh=1; + + for(int i = 0; i < n; i++) + { + ntimestep = ++update->ntimestep; + ev_set(ntimestep); + + // initial time integration + + test_atom(testatom,"Pre initial"); + + MYDBG( printf("# CUDA VerletCuda::iterate: before initial_integrate\n"); ) + + modify->initial_integrate(vflag); + + MYDBG( printf("# CUDA VerletCuda::iterate: after initial_integrate\n"); ) + + if(n_post_integrate) modify->post_integrate(); + + + + // regular communication vs neighbor list rebuild + + test_atom(testatom,"Pre Exchange"); + + MYDBG( printf("# CUDA VerletCuda::iterate: before neighbor decide\n"); ) + nflag = neighbor->decide(); + if(nflag == 0) + { + MYDBG( printf("# CUDA VerletCuda::iterate: communicate\n"); ) + timer->stamp(); + + if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)) + { + //overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms) + //build communication buffers + // printf("Pre forward_comm(1)\n"); + clock_gettime(CLOCK_REALTIME,&starttotal); + cuda->shared_data.atom.reneigh_flag=0; + clock_gettime(CLOCK_REALTIME,&starttime); + timer->stamp(); + comm->forward_comm(1); + timer->stamp(TIME_COMM); + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.comm_forward_total+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + + //prepare force calculation + // printf("Pre force_clear\n"); + force_clear(); + // printf("Pre Generate XType\n"); + Cuda_Pair_GenerateXType(&cuda->shared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + //start force calculation asynchronus + cuda->shared_data.comm.comm_phase=1; + // printf("Pre Force Compute\n"); + force->pair->compute(eflag, vflag); + timer->stamp(TIME_PAIR); + //CudaWrapper_Sync(); + + //download comm buffers from GPU, perform MPI communication and upload buffers again + clock_gettime(CLOCK_REALTIME,&starttime); + // printf("Pre forward_comm(2)\n"); + comm->forward_comm(2); + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.comm_forward_total+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + timer->stamp(TIME_COMM); + + //wait for force calculation + //printf("Pre Synch\n"); + CudaWrapper_Sync(); + timer->stamp(TIME_PAIR); + + //unpack communication buffers + clock_gettime(CLOCK_REALTIME,&starttime); + // printf("Pre forward_comm(3)\n"); + comm->forward_comm(3); + clock_gettime(CLOCK_REALTIME,&endtime); + // printf("Post forward_comm(3)\n"); + cuda->shared_data.cuda_timings.comm_forward_total+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + + timer->stamp(TIME_COMM); + MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); ) + cuda->shared_data.cuda_timings.test1+= + endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000; + } + else + { + //perform standard forward communication + //printf("Forward_comm\n"); + clock_gettime(CLOCK_REALTIME,&starttime); + comm->forward_comm(); + clock_gettime(CLOCK_REALTIME,&endtime); + //printf("Forward_comm_done\n"); + cuda->shared_data.cuda_timings.comm_forward_total+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + timer->stamp(TIME_COMM); + MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); ) + } + } + else + { + int nlocalold=cuda->shared_data.atom.nlocal; + //if(firstreneigh) + { + cuda->shared_data.atom.update_nlocal=1; + cuda->shared_data.atom.update_nmax=1; + firstreneigh=0; + } + cuda->shared_data.buffer_new=1; + MYDBG( printf("# CUDA VerletCuda::iterate: neighbor\n"); ) + cuda->setDomainParams(); + if(n_pre_exchange) modify->pre_exchange(); + if(atom->nlocal!=cuda->shared_data.atom.nlocal) //did someone add atoms during pre_exchange? + { + cuda->checkResize(); + cuda->uploadAll(); + } + + //check domain changes + if(domain->triclinic) domain->x2lamda(atom->nlocal); + MYDBG( printf("# CUDA VerletCuda::iterate: neighbor pbc\n"); ) + domain->pbc(); + if(domain->box_change) + { + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + + } + timer->stamp(); + MYDBG( printf("# CUDA VerletCuda::iterate: neighbor exchange\n"); ) + + //perform exchange of local atoms + clock_gettime(CLOCK_REALTIME,&starttime); + comm->exchange(); + clock_gettime(CLOCK_REALTIME,&endtime); + + //special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future + if(comm->nprocs>1) + { + clock_gettime(CLOCK_REALTIME,&starttime); + if(atom->special) + cuda->cu_special->upload(); + if(atom->nspecial) + cuda->cu_nspecial->upload(); + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.test1+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + } + + cuda->shared_data.cuda_timings.comm_exchange_total+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + + if(nlocalold!=cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal=2; + + //sort atoms + if (sortflag && ntimestep >= atom->nextsort) atom->sort(); + MYDBG( printf("# CUDA VerletCuda::iterate: neighbor borders\n"); ) + + //generate ghost atom lists, and transfer ghost atom data + clock_gettime(CLOCK_REALTIME,&starttime); + comm->borders(); + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.comm_border_total+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + + clock_gettime(CLOCK_REALTIME,&starttime); + //atom index maps are generated on CPU, and need to be transfered to GPU if they are used + if(cuda->cu_map_array) + cuda->cu_map_array->upload(); + + + if(domain->triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + + if(n_pre_neighbor) modify->pre_neighbor(); + + cuda->shared_data.buffer_new=2; + if(atom->molecular) cuda->cu_molecule->download(); + MYDBG( printf("# CUDA VerletCuda::iterate: neighbor build\n"); ) + timer->stamp(TIME_COMM); + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.test2+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + + //rebuild neighbor list + test_atom(testatom,"Pre Neighbor"); + neighbor->build(); + timer->stamp(TIME_NEIGHBOR); + MYDBG( printf("# CUDA VerletCuda::iterate: neighbor done\n"); ) + + //if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU + if(cuda->shared_data.pair.collect_forces_later) + { + if(cuda->cu_molecule) cuda->cu_molecule->download(); + cuda->cu_tag->download(); + cuda->cu_type->download(); + cuda->cu_mask->download(); + if(cuda->cu_q) cuda->cu_q->download(); + } + cuda->shared_data.comm.comm_phase=3; + } + + test_atom(testatom,"Post Exchange"); + + // force computations + + //only do force_clear if it has not been done during overlap of communication with local interactions + if(not((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3))) + force_clear(); + + if(n_pre_force) modify->pre_force(vflag); + + timer->stamp(); + + //if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions + /* if(cuda->shared_data.pair.collect_forces_later) + { + cuda->cu_x->downloadAsync(2); + cuda->cu_f->downloadAsync(2); + }*/ + + if(force->pair) + { + if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)&&cuda->shared_data.pair.cudable_force) + { + //second part of force calculations in case of overlaping it with commuincation. Only interactions between local and ghost atoms are done now + //regenerate data layout for force computations, its actually only needed for the ghost atoms + cuda->shared_data.comm.comm_phase=2; + + timespec atime1,atime2; + clock_gettime(CLOCK_REALTIME,&atime1); + + Cuda_Pair_GenerateXType(&cuda->shared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + clock_gettime(CLOCK_REALTIME,&atime2); + cuda->shared_data.cuda_timings.pair_xtype_conversion+= + atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; + force->pair->compute(eflag, vflag); + + } + else + { + //calculate complete pair interactions + if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll(); + else + { + //regenerate data layout for force computations, its actually only needed for the ghost atoms + timespec atime1,atime2; + clock_gettime(CLOCK_REALTIME,&atime1); + + Cuda_Pair_GenerateXType(&cuda->shared_data); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + clock_gettime(CLOCK_REALTIME,&atime2); + cuda->shared_data.cuda_timings.pair_xtype_conversion+= + atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; + } + cuda->shared_data.comm.comm_phase=0; + force->pair->compute(eflag, vflag); + } + + if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll(); + + //wait for force calculation in case of not using overlap with bonded interactions + if(not cuda->shared_data.pair.collect_forces_later) + CudaWrapper_Sync(); + + timer->stamp(TIME_PAIR); + } + + //calculate bonded interactions + if(atom->molecular) + { + cuda->cu_x->downloadAsync(2); + if(n_pre_force==0) Verlet::force_clear(); + else cuda->cu_f->downloadAsync(2); + + timer->stamp(TIME_PAIR); + + test_atom(testatom,"pre bond force"); + if(force->bond) force->bond->compute(eflag, vflag); + if(force->angle) force->angle->compute(eflag, vflag); + if(force->dihedral) force->dihedral->compute(eflag, vflag); + if(force->improper) force->improper->compute(eflag, vflag); + timer->stamp(TIME_BOND); + } + + //collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used + if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&(not (force->kspace&&(not cuda->shared_data.pppm.cudable_force)))) + { + clock_gettime(CLOCK_REALTIME,&starttime); + cuda->cu_f->uploadAsync(2); + + test_atom(testatom,"post molecular force"); + + + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + timer->stamp(TIME_PAIR); + + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.pair_force_collection+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + } + + //compute kspace force + if(force->kspace) + { + if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later)) + cuda->downloadAll(); + if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular)) + { + cuda->cu_x->downloadAsync(2); + if(n_pre_force==0) Verlet::force_clear(); + else cuda->cu_f->downloadAsync(2); + + timer->stamp(TIME_PAIR); + } + + force->kspace->compute(eflag,vflag); + if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later)) + cuda->uploadAll(); + timer->stamp(TIME_KSPACE); + } + + //collect forces in case pair forces and kspace was overlaped + if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&((force->kspace&&(not cuda->shared_data.pppm.cudable_force)))) + { + cuda->cu_f->uploadAsync(2); + + clock_gettime(CLOCK_REALTIME,&starttime); + + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + timer->stamp(TIME_PAIR); + + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.pair_force_collection+= + endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + } + + //send forces on ghost atoms back to other GPU: THIS SHOULD NEVER HAPPEN + if(force->newton) + { + comm->reverse_comm(); + timer->stamp(TIME_COMM); + } + test_atom(testatom,"post force"); + // force modifications, final time integration, diagnostics + + if(n_post_force) modify->post_force(vflag); + + test_atom(testatom,"pre final"); + + modify->final_integrate(); + + test_atom(testatom,"post final"); + + if(n_end_of_step) modify->end_of_step(); + + // all output + + test_atom(testatom,"pre output"); + + if(ntimestep == output->next) + { + if(not output->thermo->cudable) + cuda->downloadAll(); + timer->stamp(); + output->write(ntimestep); + timer->stamp(TIME_OUTPUT); + } + + + test_atom(testatom,"post output"); + + if(cuda->shared_data.atom.update_nlocal>0) + cuda->shared_data.atom.update_nlocal--; + if(cuda->shared_data.atom.update_nmax>0) + cuda->shared_data.atom.update_nmax--; + if(cuda->shared_data.domain.update>0) + cuda->shared_data.domain.update--; + if(cuda->shared_data.buffer_new>0) + cuda->shared_data.buffer_new--; + cuda->shared_data.atom.reneigh_flag=0; + } + + + cuda->downloadAll(); + cuda->downloadAllNeighborLists(); + cuda->shared_data.atom.update_nlocal=1; + cuda->shared_data.atom.update_nmax=1; + cuda->shared_data.buffer_new=1; + cuda->shared_data.domain.update=1; + cuda->oncpu = true; + cuda->finished_run = true; +} + + +/* ---------------------------------------------------------------------- + clear force on own & ghost atoms + setup and clear other arrays as needed +------------------------------------------------------------------------- */ + +void VerletCuda::force_clear() +{ + cuda->cu_f->memset_device(0); + if(cuda->cu_torque) cuda->cu_torque->memset_device(0); + return; + + //The rest should not be necessary + int i; + for(i=0;inlocal;i++) + { + atom->f[i][0]=0.0; + atom->f[i][1]=0.0; + atom->f[i][2]=0.0; + } + // clear force on all particles + // if either newton flag is set, also include ghosts + + if (neighbor->includegroup == 0) { + int nall; + if (force->newton) nall = atom->nlocal + atom->nghost; + else nall = atom->nlocal; + if (torqueflag) { + double **torque = atom->torque; + for (i = 0; i < nall; i++) { + torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + + // neighbor includegroup flag is set + // clear force only on initial nfirst particles + // if either newton flag is set, also include ghosts + + } else { + int nall = atom->nfirst; + + + if (torqueflag) { + double **torque = atom->torque; + for (i = 0; i < nall; i++) { + torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + + if (force->newton) { + nall = atom->nlocal + atom->nghost; + + if (torqueflag) { + double **torque = atom->torque; + for (i = atom->nlocal; i < nall; i++) { + torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + } + } +} + +void VerletCuda::test_atom(int aatom, char* string) //printing properties of one atom for test purposes +{ + if(not dotestatom) return; + bool check=false; + if(cuda->finished_setup) cuda->downloadAll(); + for(int i=0;inlocal+atom->nghost;i++) + { + if((atom->tag[i]==aatom)&&(inlocal)) + { + + printf("%i # CUDA %s: %i %i %e %e %e %i ",comm->me,string,update->ntimestep,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); + if(atom->molecular && (inlocal)) + { + printf(" // %i %i %i ",atom->num_bond[i],atom->num_angle[i],atom->num_dihedral[i]); + for(int k=0;knum_bond[i];k++) + printf("// %i %i ",atom->bond_type[i][k],atom->bond_atom[i][k]); + } + printf("\n"); + } + if(inlocal) + { + if((atom->v[i][0]<-100||atom->v[i][0]>100)|| + (atom->v[i][1]<-100||atom->v[i][1]>100)|| + (atom->v[i][2]<-100||atom->v[i][2]>100)|| + (atom->v[i][0]!=atom->v[i][0])|| + (atom->v[i][1]!=atom->v[i][1])|| + (atom->v[i][2]!=atom->v[i][2])) + {printf("%i # CUDA %s velocity: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); check=true;} + if((atom->f[i][0]<-10000||atom->f[i][0]>10000)|| + (atom->f[i][1]<-10000||atom->f[i][1]>10000)|| + (atom->f[i][2]<-10000||atom->f[i][2]>10000)|| + (atom->f[i][0]!=atom->f[i][0])|| + (atom->f[i][1]!=atom->f[i][1])|| + (atom->f[i][2]!=atom->f[i][2])) + {printf("%i # CUDA %s force: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); check=true;} + if(atom->tag[i]<=0) + printf("%i # CUDA %s tag: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); + } + } + if(check) exit(0); +} diff --git a/src/USER-CUDA/verlet_cuda.h b/src/USER-CUDA/verlet_cuda.h new file mode 100644 index 0000000000..6992f438a4 --- /dev/null +++ b/src/USER-CUDA/verlet_cuda.h @@ -0,0 +1,63 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef INTEGRATE_CLASS + +IntegrateStyle(verlet/cuda,VerletCuda) + +#else + + +#ifndef LMP_VERLET_CUDA_H +#define LMP_VERLET_CUDA_H +#include "verlet.h" +#include "modify_cuda.h" + +namespace LAMMPS_NS { + +class VerletCuda : public Verlet +{ + public: + VerletCuda(class LAMMPS *, int, char **); + void setup(); + void setup_minimal(int); + void run(int); + + void test_atom(int atom,char* astring); //debugging purpose + int dotestatom; //debugging purpose + + protected: + class Cuda *cuda; + void force_clear(); + double time_pair; + double time_kspace; + double time_comm; + double time_modify; + double time_fulliterate; + ModifyCuda* modify_cuda; +}; + +} + +#endif +#endif