diff --git a/src/USER-CUDA/Install.sh b/src/USER-CUDA/Install.sh index c11e58b4f6..13cc0fd222 100755 --- a/src/USER-CUDA/Install.sh +++ b/src/USER-CUDA/Install.sh @@ -4,31 +4,189 @@ if (test $1 = 1) then - if (test -e ../Makefile.package) then - sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package - sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package - sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package - sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package - sed -i '1 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package - sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package - sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) |' ../Makefile.package - sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda -lcuda -lcudart -lrt |' ../Makefile.package + if (test ! 
-e ../Makefile.package) then + cp ../Makefile.package.empty ../Makefile.package fi + sed -i -e '/^include.*cuda.*$/d' ../Makefile.package + sed -i -e 's/[^ \t]*cuda[^ \t]* //g' ../Makefile.package + sed -i -e 's/[^ \t]*CUDA[^ \t]* //g' ../Makefile.package + sed -i -e 's/[^ \t]*lrt[^ \t]* //g' ../Makefile.package + sed -i '4 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package + sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -DLMP_USER_CUDA |' ../Makefile.package + sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda |' ../Makefile.package + sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda |' ../Makefile.package + sed -i -e 's|^PKG_SYSINC =[ \t]*|&-I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package + sed -i -e 's|^PKG_SYSPATH =[ \t]*|&-L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(CUDA_USRLIB_CONDITIONAL) |' ../Makefile.package + sed -i -e 's|^PKG_SYSLIB =[ \t]*|&-lcuda -lcudart -lrt |' ../Makefile.package + + if (test -e ../atom_vec_angle.cpp) then + cp atom_vec_angle_cuda.cpp .. + cp atom_vec_angle_cuda.h .. + fi + + if (test -e ../atom_vec_full.cpp) then + cp atom_vec_full_cuda.cpp .. + cp atom_vec_full_cuda.h .. + fi + + if (test -e ../fix_freeze.cpp) then + cp fix_freeze_cuda.cpp .. + cp fix_freeze_cuda.h .. + fi + + if (test -e ../pair_born_coul_long.cpp) then + cp pair_born_coul_long_cuda.cpp .. + cp pair_born_coul_long_cuda.h .. + fi + + if (test -e ../pair_buck_coul_long.cpp) then + cp pair_buck_coul_long_cuda.cpp .. + cp pair_buck_coul_long_cuda.h .. + fi + + if (test -e ../pair_cg_cmm.cpp) then + cp pair_cg_cmm_cuda.cpp .. + cp pair_cg_cmm_coul_cut_cuda.cpp .. + cp pair_cg_cmm_coul_debye_cuda.cpp .. + cp pair_cg_cmm_cuda.h .. + cp pair_cg_cmm_coul_cut_cuda.h .. + cp pair_cg_cmm_coul_debye_cuda.h .. + fi + + if (test -e ../pair_cg_cmm_coul_long.cpp) then + cp pair_cg_cmm_coul_long_cuda.cpp .. + cp pair_cg_cmm_coul_long_cuda.h .. + fi + + if (test -e ../pppm.cpp) then + cp pppm_cuda.cpp .. + cp fft3d_cuda.cpp .. 
+ cp fft3d_wrap_cuda.cpp .. + cp pppm_cuda.h .. + cp fft3d_cuda.h .. + cp fft3d_wrap_cuda.h .. + cp pair_lj_cut_coul_long_cuda.cpp .. + cp pair_lj_cut_coul_long_cuda.h .. + fi + + + if (test -e ../pair_eam.cpp) then + cp pair_eam_alloy_cuda.cpp .. + cp pair_eam_cuda.cpp .. + cp pair_eam_fs_cuda.cpp .. + cp pair_eam_alloy_cuda.h .. + cp pair_eam_cuda.h .. + cp pair_eam_fs_cuda.h .. + fi + + if (test -e ../pair_gran_hooke.cpp) then + cp pair_gran_hooke_cuda.cpp .. + cp pair_gran_hooke_cuda.h .. + fi + + if (test -e ../pair_lj_charmm_coul_charmm.cpp) then + cp pair_lj_charmm_coul_charmm_cuda.cpp .. + cp pair_lj_charmm_coul_charmm_implicit_cuda.cpp .. + cp pair_lj_charmm_coul_charmm_cuda.h .. + cp pair_lj_charmm_coul_charmm_implicit_cuda.h .. + if (test -e ../pair_lj_charmm_coul_long.cpp) then + cp pair_lj_charmm_coul_long_cuda.cpp .. + cp pair_lj_charmm_coul_long_cuda.h .. + fi + fi + + if (test -e ../pair_lj_class2.cpp) then + cp pair_lj_class2_coul_cut_cuda.cpp .. + cp pair_lj_class2_cuda.cpp .. + cp pair_lj_class2_coul_cut_cuda.h .. + cp pair_lj_class2_cuda.h .. + if (test -e ../pair_lj_class2_coul_long.cpp) then + cp pair_lj_class2_coul_long_cuda.cpp .. + cp pair_lj_class2_coul_long_cuda.h .. + fi + fi + + cp atom_vec_atomic_cuda.cpp .. + cp atom_vec_charge_cuda.cpp .. cp comm_cuda.cpp .. + cp compute_pe_cuda.cpp .. + cp compute_pressure_cuda.cpp .. + cp compute_temp_cuda.cpp .. + cp compute_temp_partial_cuda.cpp .. cp domain_cuda.cpp .. + cp fix_addforce_cuda.cpp .. + cp fix_aveforce_cuda.cpp .. + cp fix_enforce2d_cuda.cpp .. + cp fix_gravity_cuda.cpp .. + cp fix_nh_cuda.cpp .. + cp fix_npt_cuda.cpp .. + cp fix_nve_cuda.cpp .. + cp fix_nvt_cuda.cpp .. + cp fix_set_force_cuda.cpp .. + cp fix_shake_cuda.cpp .. + cp fix_temp_berendsen_cuda.cpp .. + cp fix_temp_rescale_cuda.cpp .. + cp fix_temp_rescale_limit_cuda.cpp .. + cp fix_viscous_cuda.cpp .. cp modify_cuda.cpp .. cp neighbor_cuda.cpp .. cp neigh_full_cuda.cpp .. + cp pair_buck_coul_cut_cuda.cpp .. 
+  cp pair_buck_cuda.cpp ..   # from here on: sources copied unconditionally (no test -e guard)
+  cp pair_lj96_cut_cuda.cpp ..
+  cp pair_lj_cut_coul_cut_cuda.cpp ..
+  cp pair_lj_cut_coul_debye_cuda.cpp ..
+  cp pair_lj_cut_cuda.cpp ..
+  cp pair_lj_cut_experimental_cuda.cpp ..
+  cp pair_lj_expand_cuda.cpp ..
+  cp pair_lj_gromacs_coul_gromacs_cuda.cpp ..
+  cp pair_lj_gromacs_cuda.cpp ..
+  cp pair_lj_smooth_cuda.cpp ..
+  cp pair_morse_cuda.cpp ..
+  if (test -e ../pppm.cpp) then cp pppm_cuda.cpp ..; fi   # guarded: pppm_cuda.h is only copied inside the ../pppm.cpp check earlier in this script, so an unguarded copy would install a source without its header
   cp verlet_cuda.cpp ..
   cp cuda.cpp ..
   cp cuda_neigh_list.cpp ..
+  cp atom_vec_atomic_cuda.h ..   # headers belonging to the unconditional sources
+  cp atom_vec_charge_cuda.h ..
   cp comm_cuda.h ..
+  cp compute_pe_cuda.h ..
+  cp compute_pressure_cuda.h ..
+  cp compute_temp_cuda.h ..
+  cp compute_temp_partial_cuda.h ..
   cp domain_cuda.h ..
+  cp fix_addforce_cuda.h ..
+  cp fix_aveforce_cuda.h ..
+  cp fix_enforce2d_cuda.h ..
+  cp fix_gravity_cuda.h ..
+  cp fix_nh_cuda.h ..
+  cp fix_npt_cuda.h ..
+  cp fix_nve_cuda.h ..
+  cp fix_nvt_cuda.h ..
+  cp fix_set_force_cuda.h ..
+  cp fix_shake_cuda.h ..
+  cp fix_temp_berendsen_cuda.h ..
+  cp fix_temp_rescale_cuda.h ..
+  cp fix_temp_rescale_limit_cuda.h ..
+  cp fix_viscous_cuda.h ..
   cp modify_cuda.h ..
   cp neighbor_cuda.h ..
+  cp pair_buck_coul_cut_cuda.h ..
+  cp pair_buck_cuda.h ..
+
+  cp pair_lj96_cut_cuda.h ..
+  cp pair_lj_cut_coul_cut_cuda.h ..
+  cp pair_lj_cut_coul_debye_cuda.h ..
+  cp pair_lj_cut_cuda.h ..
+  cp pair_lj_cut_experimental_cuda.h ..
+  cp pair_lj_expand_cuda.h ..
+  cp pair_lj_gromacs_coul_gromacs_cuda.h ..
+  cp pair_lj_gromacs_cuda.h ..
+  cp pair_lj_smooth_cuda.h ..
+  cp pair_morse_cuda.h ..
   cp verlet_cuda.h ..
   cp cuda.h ..
@@ -42,26 +200,136 @@ if (test $1 = 1) then elif (test $1 = 0) then if (test -e ../Makefile.package) then - sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package - sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package - sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package - sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package + sed -i -e '/^include.*cuda.*$/d' ../Makefile.package + sed -i -e 's/[^ \t]*cuda[^ \t]* //g' ../Makefile.package + sed -i -e 's/[^ \t]*CUDA[^ \t]* //g' ../Makefile.package + sed -i -e 's/[^ \t]*lrt[^ \t]* //g' ../Makefile.package fi + rm ../atom_vec_angle_cuda.cpp + rm ../atom_vec_atomic_cuda.cpp + rm ../atom_vec_charge_cuda.cpp + rm ../atom_vec_full_cuda.cpp rm ../comm_cuda.cpp + rm ../compute_pe_cuda.cpp + rm ../compute_pressure_cuda.cpp + rm ../compute_temp_cuda.cpp + rm ../compute_temp_partial_cuda.cpp rm ../domain_cuda.cpp + rm ../fft3d_cuda.cpp + rm ../fft3d_wrap_cuda.cpp + rm ../fix_addforce_cuda.cpp + rm ../fix_aveforce_cuda.cpp + rm ../fix_enforce2d_cuda.cpp + rm ../fix_freeze_cuda.cpp + rm ../fix_gravity_cuda.cpp + rm ../fix_nh_cuda.cpp + rm ../fix_npt_cuda.cpp + rm ../fix_nve_cuda.cpp + rm ../fix_nvt_cuda.cpp + rm ../fix_set_force_cuda.cpp + rm ../fix_shake_cuda.cpp + rm ../fix_temp_berendsen_cuda.cpp + rm ../fix_temp_rescale_cuda.cpp + rm ../fix_temp_rescale_limit_cuda.cpp + rm ../fix_viscous_cuda.cpp rm ../modify_cuda.cpp rm ../neighbor_cuda.cpp rm ../neigh_full_cuda.cpp + rm ../pair_born_coul_long_cuda.cpp + rm ../pair_buck_coul_cut_cuda.cpp + rm ../pair_buck_coul_long_cuda.cpp + rm ../pair_buck_cuda.cpp + rm ../pair_cg_cmm_coul_cut_cuda.cpp + rm ../pair_cg_cmm_coul_debye_cuda.cpp + rm ../pair_cg_cmm_coul_long_cuda.cpp + rm ../pair_cg_cmm_cuda.cpp + rm ../pair_eam_alloy_cuda.cpp + rm ../pair_eam_cuda.cpp + rm ../pair_eam_fs_cuda.cpp + rm 
../pair_gran_hooke_cuda.cpp + rm ../pair_lj96_cut_cuda.cpp + rm ../pair_lj_charmm_coul_charmm_cuda.cpp + rm ../pair_lj_charmm_coul_charmm_implicit_cuda.cpp + rm ../pair_lj_charmm_coul_long_cuda.cpp + rm ../pair_lj_class2_coul_cut_cuda.cpp + rm ../pair_lj_class2_coul_long_cuda.cpp + rm ../pair_lj_class2_cuda.cpp + rm ../pair_lj_cut_coul_cut_cuda.cpp + rm ../pair_lj_cut_coul_debye_cuda.cpp + rm ../pair_lj_cut_coul_long_cuda.cpp + rm ../pair_lj_cut_cuda.cpp + rm ../pair_lj_cut_experimental_cuda.cpp + rm ../pair_lj_expand_cuda.cpp + rm ../pair_lj_gromacs_coul_gromacs_cuda.cpp + rm ../pair_lj_gromacs_cuda.cpp + rm ../pair_lj_smooth_cuda.cpp + rm ../pair_morse_cuda.cpp + rm ../pppm_cuda.cpp rm ../verlet_cuda.cpp rm ../cuda.cpp rm ../cuda_neigh_list.cpp + rm ../atom_vec_angle_cuda.h + rm ../atom_vec_atomic_cuda.h + rm ../atom_vec_charge_cuda.h + rm ../atom_vec_full_cuda.h rm ../comm_cuda.h + rm ../compute_pe_cuda.h + rm ../compute_pressure_cuda.h + rm ../compute_temp_cuda.h + rm ../compute_temp_partial_cuda.h rm ../domain_cuda.h + rm ../fft3d_cuda.h + rm ../fft3d_wrap_cuda.h + rm ../fix_addforce_cuda.h + rm ../fix_aveforce_cuda.h + rm ../fix_enforce2d_cuda.h + rm ../fix_freeze_cuda.h + rm ../fix_gravity_cuda.h + rm ../fix_nh_cuda.h + rm ../fix_npt_cuda.h + rm ../fix_nve_cuda.h + rm ../fix_nvt_cuda.h + rm ../fix_set_force_cuda.h + rm ../fix_shake_cuda.h + rm ../fix_temp_berendsen_cuda.h + rm ../fix_temp_rescale_cuda.h + rm ../fix_temp_rescale_limit_cuda.h + rm ../fix_viscous_cuda.h rm ../modify_cuda.h rm ../neighbor_cuda.h + rm ../pair_born_coul_long_cuda.h + rm ../pair_buck_coul_cut_cuda.h + rm ../pair_buck_coul_long_cuda.h + rm ../pair_buck_cuda.h + rm ../pair_cg_cmm_coul_cut_cuda.h + rm ../pair_cg_cmm_coul_debye_cuda.h + rm ../pair_cg_cmm_coul_long_cuda.h + rm ../pair_cg_cmm_cuda.h + rm ../pair_eam_alloy_cuda.h + rm ../pair_eam_cuda.h + rm ../pair_eam_fs_cuda.h + rm ../pair_gran_hooke_cuda.h + rm ../pair_lj96_cut_cuda.h + rm ../pair_lj_charmm_coul_charmm_cuda.h + rm 
../pair_lj_charmm_coul_charmm_implicit_cuda.h + rm ../pair_lj_charmm_coul_long_cuda.h + rm ../pair_lj_class2_coul_cut_cuda.h + rm ../pair_lj_class2_coul_long_cuda.h + rm ../pair_lj_class2_cuda.h + rm ../pair_lj_cut_coul_cut_cuda.h + rm ../pair_lj_cut_coul_debye_cuda.h + rm ../pair_lj_cut_coul_long_cuda.h + rm ../pair_lj_cut_cuda.h + rm ../pair_lj_cut_experimental_cuda.h + rm ../pair_lj_expand_cuda.h + rm ../pair_lj_gromacs_coul_gromacs_cuda.h + rm ../pair_lj_gromacs_cuda.h + rm ../pair_lj_smooth_cuda.h + rm ../pair_morse_cuda.h + rm ../pppm_cuda.h rm ../verlet_cuda.h rm ../cuda.h diff --git a/src/USER-CUDA/atom_vec_angle_cuda.cpp b/src/USER-CUDA/atom_vec_angle_cuda.cpp new file mode 100644 index 0000000000..3064533649 --- /dev/null +++ b/src/USER-CUDA/atom_vec_angle_cuda.cpp @@ -0,0 +1,476 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "atom_vec_angle_cuda.h" +#include "comm_cuda_cu.h" +#include "atom_vec_angle_cuda_cu.h" +#include "atom.h" +#include "domain.h" +#include "modify.h" +#include "fix.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" +#include "universe.h" +#include "comm.h" + +using namespace LAMMPS_NS; + +#define DELTA 10000 +#define BUFFACTOR 1.5 +#define BUFEXTRA 1000 +#define NCUDAEXCHANGE 12 //nextra x y z vx vy vz tag type mask image molecule + +#define BUF_FLOAT double +/* ---------------------------------------------------------------------- */ + +AtomVecAngleCuda::AtomVecAngleCuda(LAMMPS *lmp, int narg, char **arg) : + AtomVecAngle(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + maxsend=0; + cudable=true; + cuda_init_done=false; + max_nsend=0; + cu_copylist=NULL; + copylist=NULL; + copylist2=NULL; +} + +void AtomVecAngleCuda::grow_copylist(int new_max_nsend) /* (Re)allocates the copy lists for new_max_nsend entries: the old cu_copylist, copylist2 and copylist are released, then copylist is allocated with CudaWrapper_AllocPinnedHostData, copylist2 with new[] and a new cCudaData is built over copylist. Old contents are not carried over. */ +{ + max_nsend=new_max_nsend; + delete cu_copylist; + delete [] copylist2; + if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist); + copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false); + copylist2 = new int[max_nsend]; + cu_copylist = new cCudaData (copylist, max_nsend); +} + +void AtomVecAngleCuda::grow_send(int n,double** buf_send,int flag) //need to be able to grow the comm send_buffer since the array shall be copied from the gpu as a whole. Sets *maxsend to BUFFACTOR*n and gives *buf_send room for *maxsend+BUFEXTRA doubles (CudaWrapper pinned host memory when cuda->pinned, memory->srealloc/smalloc otherwise). A nonzero flag keeps the old contents, flag==0 discards them. NOTE(review): with flag set and cuda->pinned the old buffer is memcpy'd before the NULL check on *buf_send, and old_maxsend doubles are copied back whatever the new size is - confirm callers never pass a NULL or shrinking buffer with flag set. +{ + int old_maxsend=*maxsend+BUFEXTRA; + *maxsend = static_cast (BUFFACTOR * n); + if (flag) + { + if(cuda->pinned) + { + double* tmp = new double[old_maxsend]; + memcpy((void*) tmp,(void*) *buf_send,old_maxsend*sizeof(double)); + if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send)); + *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false); + memcpy(*buf_send,tmp,old_maxsend*sizeof(double)); + delete [] tmp; + } + else + { + *buf_send = (double *) + memory->srealloc(*buf_send,(*maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + } + else { + if(cuda->pinned) + { + if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send)); + *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false); + } + else + { + memory->sfree(*buf_send); + *buf_send = (double *) memory->smalloc((*maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + } +} + +void AtomVecAngleCuda::grow_both(int n) /* Grows the per-atom arrays through AtomVecAngle::grow(n). When cuda->finished_setup is set, cuda->downloadAll() runs before the grow and cuda->checkResize() plus cuda->uploadAll() run after it. */ +{ + if(cuda->finished_setup) + cuda->downloadAll(); + AtomVecAngle::grow(n); + if(cuda->finished_setup) + { + cuda->checkResize(); + cuda->uploadAll(); + } +} + +int AtomVecAngleCuda::pack_comm(int n, int* iswap, double *buf, + int pbc_flag, int *pbc) //usually this should not be 
called since comm->communicate handles the communication if only positions are exchanged. Defers to AtomVecAngle::pack_comm until cuda->finished_setup is set or while cuda->oncpu; otherwise packs through Cuda_CommCuda_PackComm (which receives *iswap) and, when X_FLOAT and double differ in size, converts a nonzero count m to (m+1)*sizeof(X_FLOAT)/sizeof(double). +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAngle::pack_comm(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + if((sizeof(X_FLOAT)!=sizeof(double)) && m) + m=(m+1)*sizeof(X_FLOAT)/sizeof(double); + return m; +} + +int AtomVecAngleCuda::pack_comm_vel(int n, int* iswap, double *buf, + int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged. Same structure as pack_comm: defers to AtomVecAngle::pack_comm_vel until setup has finished or while on the CPU, otherwise packs through Cuda_CommCuda_PackCommVel and applies the same X_FLOAT/double count conversion. +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAngle::pack_comm_vel(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + if((sizeof(X_FLOAT)!=sizeof(double)) && m) + m=(m+1)*sizeof(X_FLOAT)/sizeof(double); + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAngleCuda::unpack_comm(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged. Defers to AtomVecAngle::unpack_comm until cuda->finished_setup is set or while cuda->oncpu; otherwise hands buf to Cuda_CommCuda_UnpackComm. +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAngle::unpack_comm(n,first,buf); return;} + + Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf); +} + +void AtomVecAngleCuda::unpack_comm_vel(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged. Defers to AtomVecAngle::unpack_comm_vel under the same condition; otherwise hands buf to Cuda_CommCuda_UnpackCommVel. +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAngle::unpack_comm_vel(n,first,buf); return;} + + Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf); +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAngleCuda::pack_reverse(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only forces are exchanged. Defers to AtomVecAngle::pack_reverse until cuda->finished_setup is set or while cuda->oncpu; otherwise copies the three components of f[first..first+n-1] into buf after cuda->cu_f->download() and returns the number of values written. +{ + if(not 
cuda->finished_setup || cuda->oncpu) + return AtomVecAngle::pack_reverse(n,first,buf); + + int i,m,last; + cuda->cu_f->download(); + m = 0; + last = first + n; + for (i = first; i < last; i++) { + buf[m++] = f[i][0]; + buf[m++] = f[i][1]; + buf[m++] = f[i][2]; + } + cuda->cu_f->upload(); /* NOTE(review): f is only read between download() and this upload(), so the upload looks unnecessary - confirm */ + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAngleCuda::unpack_reverse(int n, int *list, double *buf)//usually this should not be called since comm->communicate handles the communication if only forces are exchanged. Defers to AtomVecAngle::unpack_reverse until cuda->finished_setup is set or while cuda->oncpu; otherwise calls cuda->cu_f->download(), adds three values from buf to f[list[i]] for each of the n entries, then calls cuda->cu_f->upload(). +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAngle::unpack_reverse(n,list,buf); return;} + + int i,j,m; + + m = 0; + cuda->cu_f->download(); + for (i = 0; i < n; i++) { + j = list[i]; + f[j][0] += buf[m++]; + f[j][1] += buf[m++]; + f[j][2] += buf[m++]; + } + cuda->cu_f->upload(); +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAngleCuda::pack_border(int n, int *iswap, double *buf, + int pbc_flag, int *pbc) /* Defers to AtomVecAngle::pack_border until cuda->finished_setup is set or while cuda->oncpu; otherwise returns the count from Cuda_AtomVecAngleCuda_PackBorder, which is given *iswap rather than the iswap pointer. */ +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAngle::pack_border(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_AtomVecAngleCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + return m; +} + +int AtomVecAngleCuda::pack_border_vel(int n, int *iswap, double *buf, + int pbc_flag, int *pbc) /* Same structure as pack_border, using AtomVecAngle::pack_border_vel and Cuda_AtomVecAngleCuda_PackBorderVel. */ +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAngle::pack_border_vel(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_AtomVecAngleCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAngleCuda::unpack_border(int n, int first, double *buf) /* Defers to AtomVecAngle::unpack_border until cuda->finished_setup is set or while cuda->oncpu; otherwise calls grow_both(0) while nghost+nlocal+n has reached cuda->shared_data.atom.nmax and then unpacks through Cuda_AtomVecAngleCuda_UnpackBorder. */ +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAngle::unpack_border(n,first,buf); return;} + while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data + { + grow_both(0); 
+ } + int flag=Cuda_AtomVecAngleCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf); + if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");} +} + +void AtomVecAngleCuda::unpack_border_vel(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAngle::unpack_border_vel(n,first,buf); return;} + while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data + { + grow_both(0); + } + int flag=Cuda_AtomVecAngleCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf); + if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");} +} + +/* ---------------------------------------------------------------------- + pack data for atom I for sending to another proc + xyz must be 1st 3 values, so comm::exchange() can test on them +------------------------------------------------------------------------- */ + + +int AtomVecAngleCuda::pack_exchange(int dim, double *buf) +{ + if(cuda->oncpu) + return AtomVecAngle::pack_exchange(dim,buf); + + if(not cuda_init_done||domain->box_change) + { + Cuda_AtomVecAngleCuda_Init(&cuda->shared_data); + cuda_init_done=true; + } + double** buf_pointer=(double**) buf; + if(*maxsendnghost || *buf_pointer==NULL) + { + grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0); + *maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend; + } + + if(max_nsend==0) grow_copylist(200); + + int nsend_atoms = Cuda_AtomVecAngleCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + + if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100); + if(nsend_atoms*NCUDAEXCHANGE>*maxsend) + { + grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0); + Cuda_AtomVecAngleCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + } + + int nlocal=atom->nlocal-nsend_atoms; + + for(int i=0;i ((*buf_pointer)[j]); + if(i>=nlocal) copylist2[i-nlocal]=-1; + } + + int 
actpos=0; + for(int j=1;j ((*buf_pointer)[j]); + if(iupload(); + + cuda->shared_data.atom.nlocal=nlocal; + + int m = Cuda_AtomVecAngleCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data()); + + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + double* buf_p=*buf_pointer; + for(int j=0;j (buf_p[j+1]); + int nextra=0; + int k; + buf_p[m++] = num_bond[i]; + for (k = 0; k < num_bond[i]; k++) { + buf_p[m++] = bond_type[i][k]; + buf_p[m++] = bond_atom[i][k]; + } + nextra+=2*num_bond[i]+1; + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + + buf_p[m++] = num_angle[i]; + for (k = 0; k < num_angle[i]; k++) { + buf_p[m++] = angle_type[i][k]; + buf_p[m++] = angle_atom1[i][k]; + buf_p[m++] = angle_atom2[i][k]; + buf_p[m++] = angle_atom3[i][k]; + } + nextra+=4*num_angle[i]+1; + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + + buf_p[m++] = nspecial[i][0]; + buf_p[m++] = nspecial[i][1]; + buf_p[m++] = nspecial[i][2]; + for (k = 0; k < nspecial[i][2]; k++) buf_p[m++] = special[i][k]; + nextra+=nspecial[i][2]+3; + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + { + int dm= modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf_p[m]); + m+=dm; + nextra+=dm; + if(ifix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i); + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + } + + if(ishared_data.cuda_timings.comm_exchange_cpu_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + (*buf_pointer)[0] = nsend_atoms; + atom->nlocal-=nsend_atoms; + cuda->shared_data.atom.update_nlocal=2; + //printf("End Pack Exchange\n"); + if(m==1) return 0; + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAngleCuda::unpack_exchange(double *buf) +{ +// printf("Begin UnPack Exchange\n"); + if(cuda->oncpu) + 
return AtomVecAngle::unpack_exchange(buf); + + double *sublo,*subhi; + int dim=cuda->shared_data.exchange_dim; + if(domain->box_change) + Cuda_AtomVecAngleCuda_Init(&cuda->shared_data); + if (domain->triclinic == 0) { + sublo = domain->sublo; + subhi = domain->subhi; + } else { + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + int mfirst=0; + for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++) + { + int nlocal = atom->nlocal; + int nsend_atoms=static_cast (buf[0]); + if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100); + + if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost); //ensure there is enough space on device to unpack data + int naccept = Cuda_AtomVecAngleCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data()); + cu_copylist->download(); + int m = nsend_atoms*NCUDAEXCHANGE + 1; + nlocal+=naccept; + + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + for(int j=0;j-1) + { + int k; + int i=copylist[j]; + num_bond[i] = static_cast (buf[m++]); + for (k = 0; k < num_bond[i]; k++) { + bond_type[i][k] = static_cast (buf[m++]); + bond_atom[i][k] = static_cast (buf[m++]); + } + + num_angle[i] = static_cast (buf[m++]); + for (k = 0; k < num_angle[i]; k++) { + angle_type[i][k] = static_cast (buf[m++]); + angle_atom1[i][k] = static_cast (buf[m++]); + angle_atom2[i][k] = static_cast (buf[m++]); + angle_atom3[i][k] = static_cast (buf[m++]); + } + + nspecial[i][0] = static_cast (buf[m++]); + nspecial[i][1] = static_cast (buf[m++]); + nspecial[i][2] = static_cast (buf[m++]); + for (k = 0; k < nspecial[i][2]; k++) + special[i][k] = static_cast (buf[m++]); + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + m += modify->fix[atom->extra_grow[iextra]]-> + unpack_exchange(i,&buf[m]); + + } + else + m+=static_cast (buf[j+1]); + } + + clock_gettime(CLOCK_REALTIME,&time2); + cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+= + 
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + cuda->shared_data.atom.nlocal=nlocal; + cuda->shared_data.atom.update_nlocal=2; + atom->nlocal=nlocal; + mfirst+=m; + buf=&buf[m]; + } + return mfirst; +} + + + diff --git a/src/USER-CUDA/atom_vec_angle_cuda.h b/src/USER-CUDA/atom_vec_angle_cuda.h new file mode 100644 index 0000000000..0687058aca --- /dev/null +++ b/src/USER-CUDA/atom_vec_angle_cuda.h @@ -0,0 +1,69 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifdef ATOM_CLASS + +AtomStyle(angle/cuda,AtomVecAngleCuda) /* presumably associates the style name angle/cuda with this class when the header is included with ATOM_CLASS defined - confirm against the AtomStyle macro */ + +#else + +#ifndef LMP_ATOM_VEC_ANGLE_CUDA_H +#define LMP_ATOM_VEC_ANGLE_CUDA_H + +#include "atom_vec_angle.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class AtomVecAngleCuda : public AtomVecAngle { /* AtomVecAngle variant: its comm, reverse and border pack/unpack methods call the AtomVecAngle implementation until cuda->finished_setup is set or while cuda->oncpu, and go through the Cuda_ routines otherwise (see atom_vec_angle_cuda.cpp). */ + public: + AtomVecAngleCuda(class LAMMPS *, int, char **); + virtual ~AtomVecAngleCuda() {} /* NOTE(review): empty - copylist, copylist2 and cu_copylist allocated in grow_copylist() are not released when the object is destroyed */ + void grow_copylist(int n); /* (re)allocates copylist, copylist2 and cu_copylist for n entries */ + void grow_send(int n,double** buf_send,int flag); /* resizes *buf_send; nonzero flag keeps the old contents */ + void grow_both(int n); /* AtomVecAngle::grow(n) wrapped in download/upload once setup has finished */ + int pack_comm(int, int *, double *, int, int *); + int pack_comm_vel(int, int *, double *, int, int *); + void unpack_comm(int, int, double *); + void unpack_comm_vel(int, int, double *); + int pack_reverse(int, int, double *); + void unpack_reverse(int, int *, double *); + int pack_border(int, int *, double *, int, int *); + int pack_border_vel(int, int *, double *, int, int *); + void unpack_border(int, int, double *); + void unpack_border_vel(int, int, double *); + int pack_exchange(int, double *); + int unpack_exchange(double *); + private: + class Cuda *cuda; /* taken from lmp->cuda in the constructor, which reports an error if it is NULL */ + bool cuda_init_done; /* set to true in pack_exchange once Cuda_AtomVecAngleCuda_Init has been called */ + int* copylist; /* allocated with CudaWrapper_AllocPinnedHostData in grow_copylist() */ + int* copylist2; /* new[] array of max_nsend ints, allocated in grow_copylist() */ + cCudaData* cu_copylist; /* cCudaData object constructed over copylist in grow_copylist() */ + int max_nsend; /* number of entries the copy lists were last allocated for; 0 until grow_copylist() runs */ +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/atom_vec_atomic_cuda.cpp b/src/USER-CUDA/atom_vec_atomic_cuda.cpp new file mode 100644 index 0000000000..210d712db2 --- /dev/null +++ b/src/USER-CUDA/atom_vec_atomic_cuda.cpp @@ -0,0 +1,407 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "atom_vec_atomic_cuda.h" +#include "comm_cuda_cu.h" +#include "atom_vec_atomic_cuda_cu.h" +#include "atom.h" +#include "domain.h" +#include "modify.h" +#include "fix.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" +#include "comm.h" + +using namespace LAMMPS_NS; + +#define DELTA 10000 +#define BUFFACTOR 1.5 +#define BUFEXTRA 1000 +#define NCUDAEXCHANGE 11 //nextra x y z vx vy vz tag type mask image + + +#define BUF_FLOAT double +/* ---------------------------------------------------------------------- */ + +AtomVecAtomicCuda::AtomVecAtomicCuda(LAMMPS *lmp, int narg, char **arg) : + AtomVecAtomic(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + maxsend=0; + cudable=true; + cuda_init_done=false; + max_nsend=0; + cu_copylist=NULL; + copylist=NULL; + copylist2=NULL; +} + +void AtomVecAtomicCuda::grow_copylist(int new_max_nsend) /* (Re)allocates the copy lists for new_max_nsend entries: the old cu_copylist, copylist2 and copylist are released, then copylist is allocated with CudaWrapper_AllocPinnedHostData, copylist2 with new[] and a new cCudaData is built over copylist. Old contents are not carried over. */ +{ + max_nsend=new_max_nsend; + delete cu_copylist; + delete [] copylist2; + if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist); + copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false); + copylist2 = new int[max_nsend]; + cu_copylist = new cCudaData (copylist, max_nsend); +} + +void AtomVecAtomicCuda::grow_send(int n,double** buf_send,int flag) /* Sets the value behind maxsend to BUFFACTOR times n and gives the buffer behind buf_send room for that many doubles plus BUFEXTRA (CudaWrapper pinned host memory when cuda->pinned, memory->srealloc/smalloc otherwise). A nonzero flag keeps the old contents, flag==0 discards them. NOTE(review): with flag set and cuda->pinned the old buffer is memcpy'd before the NULL check on it, and old_maxsend doubles are copied back whatever the new size is - confirm callers never pass a NULL or shrinking buffer with flag set. */ +{ + int old_maxsend=*maxsend+BUFEXTRA; + *maxsend = static_cast (BUFFACTOR * n); + if (flag) + { + if(cuda->pinned) + { + double* tmp = new double[old_maxsend]; + memcpy((void*) tmp,(void*) *buf_send,old_maxsend*sizeof(double)); + if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send)); + *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false); + memcpy(*buf_send,tmp,old_maxsend*sizeof(double)); + delete [] tmp; + } + else + { + *buf_send = (double *) + memory->srealloc(*buf_send,(*maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + } + else { + if(cuda->pinned) + { + if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send)); + *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false); + } + else + { + memory->sfree(*buf_send); + *buf_send = (double *) memory->smalloc((*maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + } +} + +void AtomVecAtomicCuda::grow_both(int n) /* Grows the per-atom arrays through AtomVecAtomic::grow(n). When cuda->finished_setup is set, cuda->downloadAll() runs before the grow and cuda->checkResize() plus cuda->uploadAll() run after it. */ +{ + if(cuda->finished_setup) + cuda->downloadAll(); + AtomVecAtomic::grow(n); + if(cuda->finished_setup) + { + cuda->checkResize(); + cuda->uploadAll(); + } +} + +int AtomVecAtomicCuda::pack_comm(int n, int* iswap, double *buf, + int pbc_flag, int *pbc) /* Defers to AtomVecAtomic::pack_comm until cuda->finished_setup is set or while cuda->oncpu; otherwise packs through Cuda_CommCuda_PackComm and, when X_FLOAT and double differ in size, converts a nonzero count m to (m+1)*sizeof(X_FLOAT)/sizeof(double). */ +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAtomic::pack_comm(n,iswap,buf,pbc_flag,pbc); + + int m = 
Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + if((sizeof(X_FLOAT)!=sizeof(double)) && m) + m=(m+1)*sizeof(X_FLOAT)/sizeof(double); + return m; +} + +int AtomVecAtomicCuda::pack_comm_vel(int n, int* iswap, double *buf, + int pbc_flag, int *pbc) +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAtomic::pack_comm_vel(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + if((sizeof(X_FLOAT)!=sizeof(double)) && m) + m=(m+1)*sizeof(X_FLOAT)/sizeof(double); + return m; +} +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicCuda::unpack_comm(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAtomic::unpack_comm(n,first,buf); return;} + + Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf); +} + +void AtomVecAtomicCuda::unpack_comm_vel(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAtomic::unpack_comm_vel(n,first,buf); return;} + + Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf); +} +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicCuda::pack_reverse(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAtomic::pack_reverse(n,first,buf); + + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + buf[m++] = f[i][0]; + buf[m++] = f[i][1]; + buf[m++] = f[i][2]; + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicCuda::unpack_reverse(int n, int *list, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAtomic::unpack_reverse(n,list,buf); return;} + + int i,j,m; + + m = 0; + for (i = 0; i < n; i++) { + j = list[i]; + f[j][0] += buf[m++]; + f[j][1] += buf[m++]; + f[j][2] += buf[m++]; + } +} + +/* 
---------------------------------------------------------------------- */ + +int AtomVecAtomicCuda::pack_border(int n, int *iswap, double *buf, + int pbc_flag, int *pbc) +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAtomic::pack_border(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_AtomVecAtomicCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + + return m; +} + +int AtomVecAtomicCuda::pack_border_vel(int n, int *iswap, double *buf, + int pbc_flag, int *pbc) +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecAtomic::pack_border_vel(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_AtomVecAtomicCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + + return m; +} +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicCuda::unpack_border(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAtomic::unpack_border(n,first,buf); return;} + while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) + { + grow_both(0); + } + int flag=Cuda_AtomVecAtomicCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf); + if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");} + +} + +void AtomVecAtomicCuda::unpack_border_vel(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecAtomic::unpack_border_vel(n,first,buf); return;} + while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) + { + grow_both(0); + } + int flag=Cuda_AtomVecAtomicCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf); + if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");} +} +/* ---------------------------------------------------------------------- + pack data for atom I for sending to another proc + xyz must be 1st 3 values, so comm::exchange() can test on them +------------------------------------------------------------------------- */ 
+ + +int AtomVecAtomicCuda::pack_exchange(int dim, double *buf) +{ + if(cuda->oncpu) + return AtomVecAtomic::pack_exchange(dim,buf); + + if(not cuda_init_done||domain->box_change) + { + Cuda_AtomVecAtomicCuda_Init(&cuda->shared_data); + cuda_init_done=true; + } + double** buf_pointer=(double**) buf; + if(*maxsendnghost || *buf_pointer==NULL) + { + grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0); + *maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend; + } + + if(max_nsend==0) grow_copylist(200); + + int nsend_atoms = Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + + if(nsend_atoms>max_nsend) {grow_copylist(nsend_atoms+100);} + if(nsend_atoms*NCUDAEXCHANGE>*maxsend) + { + grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0); + Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + } + + int nlocal=atom->nlocal-nsend_atoms; + + for(int i=0;i ((*buf_pointer)[j]); + if(i>=nlocal) copylist2[i-nlocal]=-1; + } + + int actpos=0; + for(int j=1;j ((*buf_pointer)[j]); + if(iupload(); + + cuda->shared_data.atom.nlocal=nlocal; + + int m = Cuda_AtomVecAtomicCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data()); + if (atom->nextra_grow) + for(int j=0;j ((*buf_pointer)[j+1]); + int nextra=0; + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) { + + int dm = modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&((*buf_pointer)[m])); + m+=dm; + nextra+=dm; + if(ifix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i); + if(m>*maxsend) grow_send(m,buf_pointer,1); + } + (*buf_pointer)[j+1] = nextra; + + } + + (*buf_pointer)[0] = nsend_atoms; + atom->nlocal-=nsend_atoms; + cuda->shared_data.atom.update_nlocal=2; + + if(m==1) return 0;//m is at least 1 in cuda since buf[0] contains number of atoms + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicCuda::unpack_exchange(double 
*buf) +{ + //printf("Unpack Begin\n"); + if(cuda->oncpu) + return AtomVecAtomic::unpack_exchange(buf); + + double *sublo,*subhi; + + int dim=cuda->shared_data.exchange_dim; + if(domain->box_change) + Cuda_AtomVecAtomicCuda_Init(&cuda->shared_data); + if (domain->triclinic == 0) { + sublo = domain->sublo; + subhi = domain->subhi; + } else { + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + int mfirst=0; + for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++) + { + int nlocal = atom->nlocal; + + int nsend_atoms=static_cast (buf[0]); + if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100); + + if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost); + int naccept = Cuda_AtomVecAtomicCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data()); + cu_copylist->download(); + int m = nsend_atoms*NCUDAEXCHANGE + 1; + nlocal+=naccept; + if (atom->nextra_grow) + for(int j=0;j-1) + { + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + m += modify->fix[atom->extra_grow[iextra]]-> + unpack_exchange(copylist[j],&buf[m]); + } + else + { + m+=static_cast (buf[j+1]); + } + } + cuda->shared_data.atom.nlocal=nlocal; + cuda->shared_data.atom.update_nlocal=2; + atom->nlocal=nlocal; + mfirst+=m; + buf=&buf[m]; + } + return mfirst; +} + + + diff --git a/src/USER-CUDA/atom_vec_atomic_cuda.h b/src/USER-CUDA/atom_vec_atomic_cuda.h new file mode 100644 index 0000000000..da6dfb4d3a --- /dev/null +++ b/src/USER-CUDA/atom_vec_atomic_cuda.h @@ -0,0 +1,81 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ +#ifdef ATOM_CLASS + +AtomStyle(atomic/cuda,AtomVecAtomicCuda) + +#else + +#ifndef LMP_ATOM_VEC_ATOMIC_CUDA_H +#define LMP_ATOM_VEC_ATOMIC_CUDA_H + +#include "atom_vec_atomic.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class AtomVecAtomicCuda : public AtomVecAtomic { + public: + AtomVecAtomicCuda(class LAMMPS *, int, char **); + virtual ~AtomVecAtomicCuda() {} + void grow_copylist(int n); + void grow_send(int n,double** buf_send,int flag); + void grow_both(int n); + int pack_comm(int, int *, double *, int, int *); + int pack_comm_vel(int, int *, double *, int, int *); + void unpack_comm(int, int, double *); + void unpack_comm_vel(int, int, double *); + int pack_reverse(int, int, double *); + void unpack_reverse(int, int *, double *); + int pack_border(int, int *, double *, int, int *); + int pack_border_vel(int, int *, double *, int, int *); + void unpack_border(int, int, double *); + void unpack_border_vel(int, int, double *); + int pack_exchange(int, double *); + int unpack_exchange(double *); + private: + class Cuda *cuda; + bool cuda_init_done; + int* copylist; + int* copylist2; + cCudaData* cu_copylist; + int max_nsend; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/atom_vec_charge_cuda.cpp b/src/USER-CUDA/atom_vec_charge_cuda.cpp new file mode 100644 index 0000000000..476846909a --- /dev/null +++ b/src/USER-CUDA/atom_vec_charge_cuda.cpp @@ -0,0 +1,407 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "atom_vec_charge_cuda.h" +#include "comm_cuda_cu.h" +#include "atom_vec_charge_cuda_cu.h" +#include "atom.h" +#include "domain.h" +#include "modify.h" +#include "fix.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" +#include "comm.h" + +using namespace LAMMPS_NS; + +#define DELTA 10000 +#define BUFFACTOR 1.5 +#define BUFEXTRA 1000 +#define NCUDAEXCHANGE 12 //nextra x y z vx vy vz tag type mask image q + +#define BUF_FLOAT double +/* ---------------------------------------------------------------------- */ + +AtomVecChargeCuda::AtomVecChargeCuda(LAMMPS *lmp, int narg, char **arg) : + AtomVecCharge(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + maxsend=0; + cudable=true; + cuda_init_done=false; + max_nsend=0; + cu_copylist=NULL; + copylist=NULL; + copylist2=NULL; +} + +void AtomVecChargeCuda::grow_copylist(int new_max_nsend) +{ + max_nsend=new_max_nsend; + delete cu_copylist; + delete [] copylist2; + if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist); + copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false); + copylist2 = new int[max_nsend]; + cu_copylist = new cCudaData (copylist, max_nsend); +} + +void AtomVecChargeCuda::grow_send(int n,double** buf_send,int flag) //need to be able to grow the comm send_buffer since the array sahll be copied from the gpu in whole +{ + int old_maxsend=*maxsend+BUFEXTRA; + *maxsend = static_cast (BUFFACTOR * n); + if (flag) + { + if(cuda->pinned) + { + double* tmp = new double[old_maxsend]; + memcpy((void*) tmp,(void*) *buf_send,old_maxsend*sizeof(double)); + if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send)); + *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false); + memcpy(*buf_send,tmp,old_maxsend*sizeof(double)); + delete [] tmp; + } + else + { + *buf_send = (double *) + memory->srealloc(*buf_send,(*maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + } + else { + if(cuda->pinned) + { + if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send)); + *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false); + } + else + { + memory->sfree(*buf_send); + *buf_send = (double *) memory->smalloc((*maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + } +} + +void AtomVecChargeCuda::grow_both(int n) +{ + if(cuda->finished_setup) + cuda->downloadAll(); + AtomVecCharge::grow(n); + if(cuda->finished_setup) + { + cuda->checkResize(); + cuda->uploadAll(); + } +} + +int AtomVecChargeCuda::pack_comm(int n, int* iswap, double *buf, + int pbc_flag, int *pbc) //usually this should not 
be called since comm->communicate handles the communication if only positions are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecCharge::pack_comm(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + if((sizeof(X_FLOAT)!=sizeof(double)) && m) + m=(m+1)*sizeof(X_FLOAT)/sizeof(double); + return m; +} + +int AtomVecChargeCuda::pack_comm_vel(int n, int* iswap, double *buf, + int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecCharge::pack_comm_vel(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + if((sizeof(X_FLOAT)!=sizeof(double)) && m) + m=(m+1)*sizeof(X_FLOAT)/sizeof(double); + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecChargeCuda::unpack_comm(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecCharge::unpack_comm(n,first,buf); return;} + + Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf); +} + +void AtomVecChargeCuda::unpack_comm_vel(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecCharge::unpack_comm_vel(n,first,buf); return;} + + Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf); +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecChargeCuda::pack_reverse(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only forces are exchanged 
+{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecCharge::pack_reverse(n,first,buf); + + int i,m,last; + cuda->cu_f->download(); + m = 0; + last = first + n; + for (i = first; i < last; i++) { + buf[m++] = f[i][0]; + buf[m++] = f[i][1]; + buf[m++] = f[i][2]; + } + cuda->cu_f->upload(); + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecChargeCuda::unpack_reverse(int n, int *list, double *buf)//usually this should not be called since comm->communicate handles the communication if only forces are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecCharge::unpack_reverse(n,list,buf); return;} + + int i,j,m; + + m = 0; + cuda->cu_f->download(); + for (i = 0; i < n; i++) { + j = list[i]; + f[j][0] += buf[m++]; + f[j][1] += buf[m++]; + f[j][2] += buf[m++]; + } + cuda->cu_f->upload(); +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecChargeCuda::pack_border(int n, int *iswap, double *buf, + int pbc_flag, int *pbc) +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecCharge::pack_border(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_AtomVecChargeCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + + return m; +} + +int AtomVecChargeCuda::pack_border_vel(int n, int *iswap, double *buf, + int pbc_flag, int *pbc) +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecCharge::pack_border_vel(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_AtomVecChargeCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecChargeCuda::unpack_border(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecCharge::unpack_border(n,first,buf); return;} + while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack 
data + { + grow_both(0); + } + int flag=Cuda_AtomVecChargeCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf); + if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");} +} + +void AtomVecChargeCuda::unpack_border_vel(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecCharge::unpack_border_vel(n,first,buf); return;} + while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data + { + grow_both(0); + } + int flag=Cuda_AtomVecChargeCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf); + if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");} +} + +/* ---------------------------------------------------------------------- + pack data for atom I for sending to another proc + xyz must be 1st 3 values, so comm::exchange() can test on them +------------------------------------------------------------------------- */ + + +int AtomVecChargeCuda::pack_exchange(int dim, double *buf) +{ + if(cuda->oncpu) + return AtomVecCharge::pack_exchange(dim,buf); + + if(not cuda_init_done||domain->box_change) + { + Cuda_AtomVecChargeCuda_Init(&cuda->shared_data); + cuda_init_done=true; + } + double** buf_pointer=(double**) buf; + if(*maxsendnghost || *buf_pointer==NULL) + { + grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0); + *maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend; + } + + if(max_nsend==0) grow_copylist(200); + + int nsend_atoms = Cuda_AtomVecChargeCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + + if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100); + if(nsend_atoms*NCUDAEXCHANGE>*maxsend) + { + grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0); + Cuda_AtomVecChargeCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + } + + int nlocal=atom->nlocal-nsend_atoms; + + for(int i=0;i ((*buf_pointer)[j]); + if(i>=nlocal) 
copylist2[i-nlocal]=-1; + } + + int actpos=0; + for(int j=1;j ((*buf_pointer)[j]); + if(iupload(); + + cuda->shared_data.atom.nlocal=nlocal; + + int m = Cuda_AtomVecChargeCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data()); + + if (atom->nextra_grow) + for(int j=0;j ((*buf_pointer)[j+1]); + int nextra=0; + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) { + + int dm = modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&((*buf_pointer)[m])); + m+=dm; + nextra+=dm; + if(ifix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i); + if(m>*maxsend) grow_send(m,buf_pointer,1); + } + (*buf_pointer)[j+1] = nextra; + } + + (*buf_pointer)[0] = nsend_atoms; + atom->nlocal-=nsend_atoms; + cuda->shared_data.atom.update_nlocal=2; + + if(m==1) return 0;//m is at least 1 in cuda since buf[0] contains number of atoms + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecChargeCuda::unpack_exchange(double *buf) +{ + if(cuda->oncpu) + return AtomVecCharge::unpack_exchange(buf); + double *sublo,*subhi; + + int dim=cuda->shared_data.exchange_dim; + if(domain->box_change) + Cuda_AtomVecChargeCuda_Init(&cuda->shared_data); + if (domain->triclinic == 0) { + sublo = domain->sublo; + subhi = domain->subhi; + } else { + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + int mfirst=0; + for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++) + { + int nlocal = atom->nlocal; + int nsend_atoms=static_cast (buf[0]); + if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100); + + if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost); + int naccept = Cuda_AtomVecChargeCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data()); + cu_copylist->download(); + int m = nsend_atoms*NCUDAEXCHANGE + 1; + nlocal+=naccept; + if (atom->nextra_grow) + for(int j=0;j-1) + { + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + m 
+= modify->fix[atom->extra_grow[iextra]]-> + unpack_exchange(copylist[j],&buf[m]); + } + else + m+=static_cast (buf[j+1]); + } + cuda->shared_data.atom.nlocal=nlocal; + cuda->shared_data.atom.update_nlocal=2; + atom->nlocal=nlocal; + mfirst+=m; + buf=&buf[m]; + } + return mfirst; +} + + + diff --git a/src/USER-CUDA/atom_vec_charge_cuda.h b/src/USER-CUDA/atom_vec_charge_cuda.h new file mode 100644 index 0000000000..924dd55c85 --- /dev/null +++ b/src/USER-CUDA/atom_vec_charge_cuda.h @@ -0,0 +1,69 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifdef ATOM_CLASS + +AtomStyle(charge/cuda,AtomVecChargeCuda) + +#else + +#ifndef LMP_ATOM_VEC_CHARGE_CUDA_H +#define LMP_ATOM_VEC_CHARGE_CUDA_H + +#include "atom_vec_charge.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class AtomVecChargeCuda : public AtomVecCharge { + public: + AtomVecChargeCuda(class LAMMPS *, int, char **); + virtual ~AtomVecChargeCuda() {} + void grow_copylist(int n); + void grow_send(int n,double** buf_send,int flag); + void grow_both(int n); + int pack_comm(int, int *, double *, int, int *); + int pack_comm_vel(int, int *, double *, int, int *); + void unpack_comm(int, int, double *); + void unpack_comm_vel(int, int, double *); + int pack_reverse(int, int, double *); + void unpack_reverse(int, int *, double *); + int pack_border(int, int *, double *, int, int *); + int pack_border_vel(int, int *, double *, int, int *); + void unpack_border(int, int, double *); + void unpack_border_vel(int, int, double *); + int pack_exchange(int, double *); + int unpack_exchange(double *); + private: + class Cuda *cuda; + bool cuda_init_done; + int* copylist; + int* copylist2; + cCudaData* cu_copylist; + int max_nsend; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/atom_vec_full_cuda.cpp b/src/USER-CUDA/atom_vec_full_cuda.cpp new file mode 100644 index 0000000000..e81213bfef --- /dev/null +++ b/src/USER-CUDA/atom_vec_full_cuda.cpp @@ -0,0 +1,516 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "atom_vec_full_cuda.h" +#include "comm_cuda_cu.h" +#include "atom_vec_full_cuda_cu.h" +#include "atom.h" +#include "domain.h" +#include "modify.h" +#include "fix.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" +#include "universe.h" +#include "comm.h" + +using namespace LAMMPS_NS; + +#define DELTA 10000 +#define BUFFACTOR 1.5 +#define BUFEXTRA 1000 +#define NCUDAEXCHANGE 13 //nextra x y z vx vy vz tag type mask image q molecule + +#define BUF_FLOAT double +/* ---------------------------------------------------------------------- */ + +AtomVecFullCuda::AtomVecFullCuda(LAMMPS *lmp, int narg, char **arg) : + AtomVecFull(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + maxsend=0; + cudable=true; + cuda_init_done=false; + max_nsend=0; + cu_copylist=NULL; + copylist=NULL; + copylist2=NULL; +} + +void AtomVecFullCuda::grow_copylist(int new_max_nsend) +{ + max_nsend=new_max_nsend; + delete cu_copylist; + delete [] copylist2; + if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist); + copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false); + copylist2 = new int[max_nsend]; + cu_copylist = new cCudaData (copylist, max_nsend); +} + +void AtomVecFullCuda::grow_send(int n,double** buf_send,int flag) //need to be able to grow the comm send_buffer since the array sahll be copied from the gpu in whole +{ + int old_maxsend=*maxsend+BUFEXTRA; + *maxsend = static_cast (BUFFACTOR * n); + if (flag) + { + if(cuda->pinned) + { + double* tmp = new double[old_maxsend]; + memcpy((void*) tmp,(void*) *buf_send,old_maxsend*sizeof(double)); + if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send)); + *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false); + memcpy(*buf_send,tmp,old_maxsend*sizeof(double)); + delete [] tmp; + } + else + { + *buf_send = (double *) + memory->srealloc(*buf_send,(*maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + } + else { + if(cuda->pinned) + { + if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send)); + *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false); + } + else + { + memory->sfree(*buf_send); + *buf_send = (double *) memory->smalloc((*maxsend+BUFEXTRA)*sizeof(double), + "comm:buf_send"); + } + } +} + +void AtomVecFullCuda::grow_both(int n) +{ + if(cuda->finished_setup) + cuda->downloadAll(); + AtomVecFull::grow(n); + if(cuda->finished_setup) + { + cuda->checkResize(); + cuda->uploadAll(); + } +} + +int AtomVecFullCuda::pack_comm(int n, int* iswap, double *buf, + int pbc_flag, int *pbc) //usually this should not be called 
since comm->communicate handles the communication if only positions are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecFull::pack_comm(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + if((sizeof(X_FLOAT)!=sizeof(double)) && m) + m=(m+1)*sizeof(X_FLOAT)/sizeof(double); + return m; +} + +int AtomVecFullCuda::pack_comm_vel(int n, int* iswap, double *buf, + int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecFull::pack_comm_vel(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + if((sizeof(X_FLOAT)!=sizeof(double)) && m) + m=(m+1)*sizeof(X_FLOAT)/sizeof(double); + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecFullCuda::unpack_comm(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecFull::unpack_comm(n,first,buf); return;} + + Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf); +} + +void AtomVecFullCuda::unpack_comm_vel(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecFull::unpack_comm_vel(n,first,buf); return;} + + Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf); +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecFullCuda::pack_reverse(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only forces are exchanged +{ + if(not 
cuda->finished_setup || cuda->oncpu) + return AtomVecFull::pack_reverse(n,first,buf); + + int i,m,last; + cuda->cu_f->download(); + m = 0; + last = first + n; + for (i = first; i < last; i++) { + buf[m++] = f[i][0]; + buf[m++] = f[i][1]; + buf[m++] = f[i][2]; + } + cuda->cu_f->upload(); + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecFullCuda::unpack_reverse(int n, int *list, double *buf)//usually this should not be called since comm->communicate handles the communication if only forces are exchanged +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecFull::unpack_reverse(n,list,buf); return;} + + int i,j,m; + + m = 0; + cuda->cu_f->download(); + for (i = 0; i < n; i++) { + j = list[i]; + f[j][0] += buf[m++]; + f[j][1] += buf[m++]; + f[j][2] += buf[m++]; + } + cuda->cu_f->upload(); +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecFullCuda::pack_border(int n, int *iswap, double *buf, + int pbc_flag, int *pbc) +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecFull::pack_border(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_AtomVecFullCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + return m; +} + +int AtomVecFullCuda::pack_border_vel(int n, int *iswap, double *buf, + int pbc_flag, int *pbc) +{ + if(not cuda->finished_setup || cuda->oncpu) + return AtomVecFull::pack_border_vel(n,iswap,buf,pbc_flag,pbc); + + int m = Cuda_AtomVecFullCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag); + + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecFullCuda::unpack_border(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecFull::unpack_border(n,first,buf); return;} + while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data + { + grow_both(0); + } + int 
flag=Cuda_AtomVecFullCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf); + if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");} +} + +void AtomVecFullCuda::unpack_border_vel(int n, int first, double *buf) +{ + if(not cuda->finished_setup || cuda->oncpu) + {AtomVecFull::unpack_border_vel(n,first,buf); return;} + while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data + { + grow_both(0); + } + int flag=Cuda_AtomVecFullCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf); + if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");} +} + +/* ---------------------------------------------------------------------- + pack data for atom I for sending to another proc + xyz must be 1st 3 values, so comm::exchange() can test on them +------------------------------------------------------------------------- */ + + +int AtomVecFullCuda::pack_exchange(int dim, double *buf) +{ + if(cuda->oncpu) + return AtomVecFull::pack_exchange(dim,buf); + + if(not cuda_init_done||domain->box_change) + { + Cuda_AtomVecFullCuda_Init(&cuda->shared_data); + cuda_init_done=true; + } + double** buf_pointer=(double**) buf; + if(*maxsendnghost || *buf_pointer==NULL) + { + grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0); + *maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend; + } + + if(max_nsend==0) grow_copylist(200); + + int nsend_atoms = Cuda_AtomVecFullCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + + if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100); + if(nsend_atoms*NCUDAEXCHANGE>*maxsend) + { + grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0); + Cuda_AtomVecFullCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + } + + int nlocal=atom->nlocal-nsend_atoms; + + for(int i=0;i ((*buf_pointer)[j]); + if(i>=nlocal) copylist2[i-nlocal]=-1; + } + + int actpos=0; + for(int 
j=1;j ((*buf_pointer)[j]); + if(iupload(); + + cuda->shared_data.atom.nlocal=nlocal; + + int m = Cuda_AtomVecFullCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data()); + + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + double* buf_p=*buf_pointer; + for(int j=0;j (buf_p[j+1]); + int nextra=0; + int k; + buf_p[m++] = num_bond[i]; + for (k = 0; k < num_bond[i]; k++) { + buf_p[m++] = bond_type[i][k]; + buf_p[m++] = bond_atom[i][k]; + } + nextra+=2*num_bond[i]+1; + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + + buf_p[m++] = num_angle[i]; + for (k = 0; k < num_angle[i]; k++) { + buf_p[m++] = angle_type[i][k]; + buf_p[m++] = angle_atom1[i][k]; + buf_p[m++] = angle_atom2[i][k]; + buf_p[m++] = angle_atom3[i][k]; + } + nextra+=4*num_angle[i]+1; + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + + buf_p[m++] = num_dihedral[i]; + for (k = 0; k < num_dihedral[i]; k++) { + buf_p[m++] = dihedral_type[i][k]; + buf_p[m++] = dihedral_atom1[i][k]; + buf_p[m++] = dihedral_atom2[i][k]; + buf_p[m++] = dihedral_atom3[i][k]; + buf_p[m++] = dihedral_atom4[i][k]; + } + nextra+=5*num_dihedral[i]+1; + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + + buf_p[m++] = num_improper[i]; + for (k = 0; k < num_improper[i]; k++) { + buf_p[m++] = improper_type[i][k]; + buf_p[m++] = improper_atom1[i][k]; + buf_p[m++] = improper_atom2[i][k]; + buf_p[m++] = improper_atom3[i][k]; + buf_p[m++] = improper_atom4[i][k]; + } + nextra+=5*num_improper[i]+1; + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + + buf_p[m++] = nspecial[i][0]; + buf_p[m++] = nspecial[i][1]; + buf_p[m++] = nspecial[i][2]; + for (k = 0; k < nspecial[i][2]; k++) buf_p[m++] = special[i][k]; + nextra+=nspecial[i][2]+3; + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + { + int dm= 
modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf_p[m]); + m+=dm; + nextra+=dm; + if(ifix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i); + if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;} + } + + if(ishared_data.cuda_timings.comm_exchange_cpu_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + (*buf_pointer)[0] = nsend_atoms; + atom->nlocal-=nsend_atoms; + cuda->shared_data.atom.update_nlocal=2; + //printf("End Pack Exchange\n"); + if(m==1) return 0; + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecFullCuda::unpack_exchange(double *buf) +{ +// printf("Begin UnPack Exchange\n"); + if(cuda->oncpu) + return AtomVecFull::unpack_exchange(buf); + + double *sublo,*subhi; + int dim=cuda->shared_data.exchange_dim; + if(domain->box_change) + Cuda_AtomVecFullCuda_Init(&cuda->shared_data); + if (domain->triclinic == 0) { + sublo = domain->sublo; + subhi = domain->subhi; + } else { + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + int mfirst=0; + for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++) + { + int nlocal = atom->nlocal; + int nsend_atoms=static_cast (buf[0]); + if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100); + + if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost); //ensure there is enough space on device to unpack data + int naccept = Cuda_AtomVecFullCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data()); + cu_copylist->download(); + int m = nsend_atoms*NCUDAEXCHANGE + 1; + nlocal+=naccept; + + timespec time1,time2; + clock_gettime(CLOCK_REALTIME,&time1); + + for(int j=0;j-1) + { + int k; + int i=copylist[j]; + num_bond[i] = static_cast (buf[m++]); + for (k = 0; k < num_bond[i]; k++) { + bond_type[i][k] = static_cast (buf[m++]); + bond_atom[i][k] = static_cast (buf[m++]); + } + + num_angle[i] = static_cast (buf[m++]); + for (k = 0; k < num_angle[i]; 
k++) { + angle_type[i][k] = static_cast (buf[m++]); + angle_atom1[i][k] = static_cast (buf[m++]); + angle_atom2[i][k] = static_cast (buf[m++]); + angle_atom3[i][k] = static_cast (buf[m++]); + } + + num_dihedral[i] = static_cast (buf[m++]); + for (k = 0; k < num_dihedral[i]; k++) { + dihedral_type[i][k] = static_cast (buf[m++]); + dihedral_atom1[i][k] = static_cast (buf[m++]); + dihedral_atom2[i][k] = static_cast (buf[m++]); + dihedral_atom3[i][k] = static_cast (buf[m++]); + dihedral_atom4[i][k] = static_cast (buf[m++]); + } + + num_improper[i] = static_cast (buf[m++]); + for (k = 0; k < num_improper[i]; k++) { + improper_type[i][k] = static_cast (buf[m++]); + improper_atom1[i][k] = static_cast (buf[m++]); + improper_atom2[i][k] = static_cast (buf[m++]); + improper_atom3[i][k] = static_cast (buf[m++]); + improper_atom4[i][k] = static_cast (buf[m++]); + } + + nspecial[i][0] = static_cast (buf[m++]); + nspecial[i][1] = static_cast (buf[m++]); + nspecial[i][2] = static_cast (buf[m++]); + for (k = 0; k < nspecial[i][2]; k++) + special[i][k] = static_cast (buf[m++]); + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + m += modify->fix[atom->extra_grow[iextra]]-> + unpack_exchange(i,&buf[m]); + + } + else + m+=static_cast (buf[j+1]); + } + + clock_gettime(CLOCK_REALTIME,&time2); + cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + cuda->shared_data.atom.nlocal=nlocal; + cuda->shared_data.atom.update_nlocal=2; + atom->nlocal=nlocal; + mfirst+=m; + buf=&buf[m]; + } + return mfirst; +} + + + diff --git a/src/USER-CUDA/atom_vec_full_cuda.h b/src/USER-CUDA/atom_vec_full_cuda.h new file mode 100644 index 0000000000..f16fd7703d --- /dev/null +++ b/src/USER-CUDA/atom_vec_full_cuda.h @@ -0,0 +1,69 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original 
Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(full/cuda,AtomVecFullCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_FULL_CUDA_H
+#define LMP_ATOM_VEC_FULL_CUDA_H
+
+#include "atom_vec_full.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+// Atom style "full/cuda": an AtomVecFull whose pack/unpack methods call Cuda_* routines.
+class AtomVecFullCuda : public AtomVecFull {
+ public:
+  AtomVecFullCuda(class LAMMPS *, int, char **);
+  virtual ~AtomVecFullCuda() {}                      // empty body: nothing is released here
+  void grow_copylist(int n);
+  void grow_send(int n,double** buf_send,int flag);
+  void grow_both(int n);
+  int pack_comm(int, int *, double *, int, int *);   // comm/reverse/border methods defer to AtomVecFull
+  int pack_comm_vel(int, int *, double *, int, int *); // while CUDA setup is unfinished or cuda->oncpu is set
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int pack_exchange(int, double *);                  // exchange pair defers to AtomVecFull only when cuda->oncpu is set
+  int unpack_exchange(double *);
+ private:
+  class Cuda *cuda;          // consulted for finished_setup, oncpu and shared_data
+  bool cuda_init_done;       // set once pack_exchange has called Cuda_AtomVecFullCuda_Init
+  int* copylist;             // host-side index list used by the exchange routines
+  int* copylist2;            // second host-side index list, written in pack_exchange
+  cCudaData* cu_copylist;    // offers dev_data()/download(); NOTE(review): template arguments look lost in this copy - confirm
+  int max_nsend;             // compared with the number of atoms to send before grow_copylist is called
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp
index 6f90227112..aac1e53239 100644
---
a/src/USER-CUDA/comm_cuda.cpp +++ b/src/USER-CUDA/comm_cuda.cpp @@ -55,6 +55,8 @@ enum{SINGLE,MULTI}; CommCuda::CommCuda(LAMMPS *lmp):Comm(lmp) { cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); cu_pbc=NULL; cu_slablo=NULL; diff --git a/src/USER-CUDA/comm_cuda.cu b/src/USER-CUDA/comm_cuda.cu new file mode 100644 index 0000000000..0233f3ee13 --- /dev/null +++ b/src/USER-CUDA/comm_cuda.cu @@ -0,0 +1,483 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#define MY_PREFIX comm_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "comm_cuda_cu.h" +#include "comm_cuda_kernel.cu" +#include + +void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n) +{ + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffersizebuffer,sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize=size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) + } + cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); +} + + +void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); +} + + +void Cuda_CommCuda_Init(cuda_shared_data* sdata) +{ + Cuda_CommCuda_UpdateNmax(sdata); + int ntypesp=sdata->atom.ntypes+1; + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , &ntypesp, sizeof(int)); + cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata, sizeof(int*)); +} + +int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & 
sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + + void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; + Cuda_CommCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + + } + return 3*n; +} + +int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) +{ + + 
timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + cudaMemset( sdata->flag,0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + + void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; + Cuda_CommCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n + ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_pack+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_download+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag!=0) printf("aflag PackComm: %i\n",aflag); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel 
execution failed"); + + } + return 6*n; +} + +int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + +clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_CommCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 3*n; +} + +int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) +{ + MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) + timespec time1,time2; + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + 
if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + static int count=-1; + count++; + X_FLOAT dx=0.0; + X_FLOAT dy=0.0; + X_FLOAT dz=0.0; + if (pbc_flag != 0) { + if (sdata->domain.triclinic == 0) { + dx = pbc[0]*sdata->domain.prd[0]; + dy = pbc[1]*sdata->domain.prd[1]; + dz = pbc[2]*sdata->domain.prd[2]; + } else { + dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; + dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; + dz = pbc[2]*sdata->domain.prd[2]; + }} + + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + +clock_gettime(CLOCK_REALTIME,&time1); + + Cuda_CommCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_kernel_self+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 6*n; +} + +void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) +{ + timespec time1,time2; + + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { +clock_gettime(CLOCK_REALTIME,&time1); + if(not sdata->overlap_comm||iswap<0) + cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_upload+= + 
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; + Cuda_CommCuda_UnpackComm_Kernel<<>>(n,first,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed"); + + } +} + +void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) +{ + timespec time1,time2; + + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*6*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { +clock_gettime(CLOCK_REALTIME,&time1); + + if(not sdata->overlap_comm||iswap<0) + cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice); + +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_forward_upload+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; + Cuda_CommCuda_UnpackComm_Kernel<<>>(n,first,buf); + cudaThreadSynchronize(); + +clock_gettime(CLOCK_REALTIME,&time1); +sdata->cuda_timings.comm_forward_kernel_unpack+= + time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed"); + + } +} + +int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + 
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(F_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + + F_FLOAT* buf=(F_FLOAT*)buf_send; + F_FLOAT* f_dev=(F_FLOAT*)sdata->atom.f.dev_data; + f_dev+=first; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + buf+=n; f_dev+=sdata->atom.nmax; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + buf+=n; f_dev+=sdata->atom.nmax; + cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + return n*3; +} + + +void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(F_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + if(sdata->atom.nlocal>0) + { + cudaMemcpy(sdata->buffer,buf_recv, size, cudaMemcpyHostToDevice); + Cuda_CommCuda_UnpackReverse_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed"); + } +} + +void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first) +{ + if(sdata->atom.update_nmax) + Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + int size=n*3*sizeof(X_FLOAT); + if(sdata->buffer_new or (size>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,n); + + int3 layout=getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal>0) + { + Cuda_CommCuda_UnpackReverse_Self_Kernel<<>>((int*) 
sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed"); + + } +} + + +int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap) +{ + MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");) + timespec time1,time2; + Cuda_CommCuda_UpdateNmax(sdata); + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + if(sdata->buffer_new or (80>sdata->buffersize)) + Cuda_CommCuda_UpdateBuffer(sdata,10); + int n; + if (!bordergroup || ineed >= 2) + n=nlast-nfirst+1; + else + { + n=atom_nfirst; + if(nlast-sdata->atom.nlocal+1>n) n=nlast-sdata->atom.nlocal+1; + } + int3 layout=getgrid(n,0,512,true); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x+1, layout.y, 1); + + + cudaMemset((int*) (sdata->buffer),0,sizeof(int)); + +clock_gettime(CLOCK_REALTIME,&time1); + if(style==1) + Cuda_CommCuda_BuildSendlist_Single<<>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength); + else + Cuda_CommCuda_BuildSendlist_Multi<<>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength); + cudaThreadSynchronize(); +clock_gettime(CLOCK_REALTIME,&time2); +sdata->cuda_timings.comm_border_kernel_buildlist+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed"); + int nsend; + cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost); + return nsend; + + +} + diff --git a/src/USER-CUDA/compute_pe_cuda.cpp b/src/USER-CUDA/compute_pe_cuda.cpp new file mode 100644 index 
0000000000..0d93aea249
--- /dev/null
+++ b/src/USER-CUDA/compute_pe_cuda.cpp
@@ -0,0 +1,65 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+// NOTE(review): the header name was missing in this copy; <cstring> is presumed - confirm upstream
+#include <cstring>
+#include "compute_pe_cuda.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "modify.h"
+#include "domain.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   pe/cuda: forwards all arguments to ComputePE; the only addition is
+   that the compute flags itself as cudable
+------------------------------------------------------------------------- */
+
+ComputePECuda::ComputePECuda(LAMMPS *lmp, int narg, char **arg) :
+  ComputePE(lmp, narg, arg)
+{
+  cudable = 1;
+}
diff --git a/src/USER-CUDA/compute_pe_cuda.h b/src/USER-CUDA/compute_pe_cuda.h
new file mode 100644
index 0000000000..71444f671c
--- /dev/null
+++ b/src/USER-CUDA/compute_pe_cuda.h
@@ -0,0 +1,59 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS + +ComputeStyle(pe/cuda,ComputePECuda) + +#else + +#ifndef LMP_COMPUTE_PE_CUDA_H +#define LMP_COMPUTE_PE_CUDA_H + +#include "compute_pe.h" + +namespace LAMMPS_NS { + +class ComputePECuda : public ComputePE { + public: + ComputePECuda(class LAMMPS *, int, char **); + ~ComputePECuda() {} +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/compute_pressure_cuda.cpp b/src/USER-CUDA/compute_pressure_cuda.cpp new file mode 100644 index 0000000000..bb3e49e8e9 --- /dev/null +++ b/src/USER-CUDA/compute_pressure_cuda.cpp @@ -0,0 +1,97 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstring>
+#include <cstdlib>
+#include "compute_pressure_cuda.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+enum{DUMMY0,INVOKED_SCALAR,INVOKED_VECTOR,DUMMMY3,INVOKED_PERATOM};
+
+/* ---------------------------------------------------------------------- */
+
+ComputePressureCuda::ComputePressureCuda(LAMMPS *lmp, int narg, char **arg) :
+  ComputePressure(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cudable = 1;
+
+  // the pressure is only cudable if its temperature compute (ID in arg[3])
+  // is; the ID goes straight to find_compute(), so no temporary copy is made
+
+  if (narg < 4) error->all("Illegal compute pressure/cuda command");
+  int icompute = modify->find_compute(arg[3]);
+  if (icompute < 0) error->all("Could not find compute pressure/cuda temperature ID");
+  if (modify->compute[icompute]->cudable == 0) {
+    error->warning("Compute pressure/cuda temperature ID is not cudable! Try a temp/cuda style.");
+    cudable = 0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   base-class scalar pressure; downloadAll() first if temperature is host-only
+------------------------------------------------------------------------- */
+
+double ComputePressureCuda::compute_scalar()
+{
+  if(not temperature->cudable && cuda->finished_setup) cuda->downloadAll();   // presumably a device-to-host sync
+  return ComputePressure::compute_scalar();   // the value has to be handed back to the caller
+}
+
+/* base-class pressure tensor; same host synchronisation as compute_scalar() */
+void ComputePressureCuda::compute_vector()
+{
+  if(not temperature->cudable && cuda->finished_setup) cuda->downloadAll();
+  ComputePressure::compute_vector();
+}
diff --git a/src/USER-CUDA/compute_pressure_cuda.h b/src/USER-CUDA/compute_pressure_cuda.h
new file mode 100644
index 0000000000..d99f4a5cca
--- /dev/null
+++ b/src/USER-CUDA/compute_pressure_cuda.h
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(pressure/cuda,ComputePressureCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_PRESSURE_CUDA_H
+#define LMP_COMPUTE_PRESSURE_CUDA_H
+
+#include "compute_pressure.h"
+
+namespace LAMMPS_NS {
+
+class ComputePressureCuda : public ComputePressure {   // compute style "pressure/cuda"
+ public:
+  ComputePressureCuda(class LAMMPS *, int, char **);   // requires lmp->cuda to be set; arg[3] names the temperature compute
+  ~ComputePressureCuda() {}                            // the Cuda object is not owned here
+  double compute_scalar();                             // scalar pressure via ComputePressure
+  void compute_vector();                               // pressure tensor via ComputePressure
+
+ private:
+  class Cuda *cuda;                                    // copy of lmp->cuda taken in the constructor
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/compute_temp_cuda.cpp b/src/USER-CUDA/compute_temp_cuda.cpp
new file mode 100644
index 0000000000..a16939f95c
--- /dev/null
+++ b/src/USER-CUDA/compute_temp_cuda.cpp
@@ -0,0 +1,212 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.
Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "compute_temp_cuda.h"
+#include "compute_temp_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "group.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   "temp/cuda" compute: a scalar temperature plus a 6-entry tensor;
+   takes exactly 3 arguments, device buffers are created on first use
+------------------------------------------------------------------------- */
+
+ComputeTempCuda::ComputeTempCuda(LAMMPS *lmp, int narg, char **arg) :
+  Compute(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (narg != 3) error->all("Illegal compute temp/cuda command");
+
+  scalar_flag = vector_flag = 1;
+  size_vector = 6;
+  extscalar = 0;
+  extvector = 1;
+  tempflag = 1;
+  vector = new double[6];
+  cu_t_vector = 0;
+  cu_t_scalar = 0;
+  cudable=true;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeTempCuda::~ComputeTempCuda()
+{
+  delete [] vector;
+  delete cu_t_vector;   // both wrappers may still be 0 if no compute was ever invoked
+  delete cu_t_scalar;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempCuda::init()
+{
+  fix_dof = 0;   // degrees of freedom removed by fixes acting on this group
+  for (int i = 0; i < modify->nfix; i++)
+    fix_dof += modify->fix[i]->dof(igroup);
+  dof_compute();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempCuda::dof_compute()
+{
+  double natoms = group->count(igroup);
+  dof = domain->dimension * natoms;
+  dof -= extra_dof + fix_dof;
+  if (dof > 0.0) tfactor = force->mvv2e / (dof * force->boltz);
+  else tfactor = 0.0;   // no degrees of freedom left: report zero temperature
+}
+
+/* ----------------------------------------------------------------------
+   scalar temperature: sum of m*v^2 over the group (from the GPU kernel once
+   cuda->begin_setup is set, host loop otherwise), MPI-summed, times tfactor
+------------------------------------------------------------------------- */
+
+double ComputeTempCuda::compute_scalar()
+{
+  if(cuda->begin_setup)
+  {
+    if(not cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);   // NOTE(review): template arguments were lost in this copy; reconstructed from the double host data and the (ENERGY_FLOAT*) cast - confirm
+    if(not cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    invoked_scalar = update->ntimestep;
+    Cuda_ComputeTempCuda_Scalar(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_scalar->dev_data());
+    cu_t_scalar->download();
+  }
+  else
+  {
+    invoked_scalar = update->ntimestep;
+    double **v = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    int nlocal = atom->nlocal;
+    double t = 0.0;
+    if (rmass) {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit)
+          t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i];
+    } else {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit)
+          t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) *
+            mass[type[i]];
+    }
+    t_scalar=t;
+  }
+
+  MPI_Allreduce(&t_scalar,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
+  if (dynamic) dof_compute();
+  scalar *= tfactor;
+  if(scalar>1e15)
+  {
+    if(cuda->begin_setup) {   // refresh host copies only when the sum came from device data
+      cuda->cu_v->download(); cuda->cu_x->download(); cuda->cu_type->download();
+    }
+    double **v = atom->v;
+    double **x = atom->x;
+    printf("Out of v-range atoms: \n");
+    for(int i=0;i<atom->nlocal;i++)
+      if((v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2])>1e5)
+        printf("%i %i // %lf %lf %lf // %lf %lf %lf\n",atom->tag[i],atom->type[i],x[i][0], x[i][1], x[i][2],v[i][0], v[i][1], v[i][2]);
+    error->all("Temperature out of range. Simulations will be abortet.\n");
+  }
+  return scalar;
+}
+
+/* ----------------------------------------------------------------------
+   six m*v_a*v_b sums (xx,yy,zz,xy,xz,yz) over the group, from the GPU
+   kernel or the host loop, MPI-summed and scaled by force->mvv2e
+------------------------------------------------------------------------- */
+
+void ComputeTempCuda::compute_vector()
+{
+  int i;
+  if(cuda->begin_setup)
+  {
+    if(not cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);   // NOTE(review): reconstructed template arguments - confirm
+    if(not cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    invoked_vector = update->ntimestep;
+    Cuda_ComputeTempCuda_Vector(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_vector->dev_data());
+    cu_t_vector->download();
+  }
+  else
+  {
+    invoked_vector = update->ntimestep;
+
+    double **v = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    int nlocal = atom->nlocal;
+
+    double massone,t[6];
+    for (i = 0; i < 6; i++) t[i] = 0.0;
+
+    for (i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        if (rmass) massone = rmass[i];
+        else massone = mass[type[i]];
+        t[0] += massone * v[i][0]*v[i][0];
+        t[1] += massone * v[i][1]*v[i][1];
+        t[2] += massone * v[i][2]*v[i][2];
+        t[3] += massone * v[i][0]*v[i][1];
+        t[4] += massone * v[i][0]*v[i][2];
+        t[5] += massone * v[i][1]*v[i][2];
+      }
+
+    for (i = 0; i < 6; i++) t_vector[i]=t[i];
+  }
+  MPI_Allreduce(t_vector,vector,6,MPI_DOUBLE,MPI_SUM,world);
+  for (i = 0; i < 6; i++) vector[i] *= force->mvv2e;
+}
diff --git a/src/USER-CUDA/compute_temp_cuda.h b/src/USER-CUDA/compute_temp_cuda.h
new file mode 100644
index 0000000000..35ae0bbf3f
--- /dev/null
+++ b/src/USER-CUDA/compute_temp_cuda.h
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(temp/cuda,ComputeTempCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_TEMP_CUDA_H
+#define LMP_COMPUTE_TEMP_CUDA_H
+
+#include "compute.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class ComputeTempCuda : public Compute {   // compute style "temp/cuda"
+ public:
+  ComputeTempCuda(class LAMMPS *, int, char **);   // requires lmp->cuda and exactly 3 arguments
+  ~ComputeTempCuda();                              // frees vector and both device wrappers
+  void init();                                     // sums fix->dof() and recomputes tfactor
+  double compute_scalar();                         // temperature of the group
+  void compute_vector();                           // 6-entry kinetic tensor
+
+ private:
+  class Cuda *cuda;                                // copy of lmp->cuda, not owned
+  int fix_dof;                                     // degrees of freedom removed by fixes
+  double tfactor;                                  // mvv2e / (dof * boltz), 0 if dof <= 0
+
+  void dof_compute();
+  double t_vector[6];                              // per-proc tensor sums before MPI_Allreduce
+  double t_scalar;                                 // per-proc scalar sum before MPI_Allreduce
+  cCudaData<double, ENERGY_FLOAT, x>* cu_t_scalar; // NOTE(review): template arguments were lost in this copy; reconstructed from the .cpp usage - confirm
+  cCudaData<double, ENERGY_FLOAT, x>* cu_t_vector;
+
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/compute_temp_partial_cuda.cpp b/src/USER-CUDA/compute_temp_partial_cuda.cpp
new file mode 100644
index 0000000000..2965e273cd
--- /dev/null
+++ b/src/USER-CUDA/compute_temp_partial_cuda.cpp
@@ -0,0 +1,357 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "compute_temp_partial_cuda.h"
+#include "compute_temp_partial_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "group.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   temperature from selected velocity components: arg[3..5] are the x/y/z
+   on/off flags; host and device bias buffers are created on first use
+------------------------------------------------------------------------- */
+
+ComputeTempPartialCuda::ComputeTempPartialCuda(LAMMPS *lmp, int narg, char **arg) :
+  Compute(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  if (narg != 6) error->all("Illegal compute temp/partial command");
+
+  scalar_flag = vector_flag = 1;
+  size_vector = 6;
+  extscalar = 0;
+  extvector = 1;
+  tempflag = 1;
+  tempbias = 1;
+
+  xflag = atoi(arg[3]);
+  yflag = atoi(arg[4]);
+  zflag = atoi(arg[5]);
+  if (zflag && domain->dimension == 2)
+    error->all("Compute temp/partial cannot use vz for 2d systemx");
+
+  maxbias = 0;
+  vbiasall = NULL;
+  vector = new double[6];
+  cu_t_vector = 0;
+  cu_t_scalar = 0;
+  cu_vbiasall=NULL;
+  cudable=true;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeTempPartialCuda::~ComputeTempPartialCuda()
+{
+  memory->destroy(vbiasall);
+  delete [] vector;
+  delete cu_t_vector;   // the three wrappers may still be NULL if never used
+  delete cu_t_scalar;
+  delete cu_vbiasall;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::init()
+{
+  fix_dof = 0;   // degrees of freedom removed by fixes acting on this group
+  for (int i = 0; i < modify->nfix; i++)
+    fix_dof += modify->fix[i]->dof(igroup);
+  dof_compute();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::dof_compute()
+{
+  double natoms = group->count(igroup);
+  int nper = xflag+yflag+zflag;   // number of velocity components that are counted
+  dof = nper * natoms;
+  dof -= (1.0*nper/domain->dimension)*fix_dof + extra_dof;   // fix_dof is scaled to the counted components
+  if (dof > 0) tfactor = force->mvv2e / (dof * force->boltz);
+  else tfactor = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int ComputeTempPartialCuda::dof_remove(int i)
+{
+  int nper = xflag+yflag+zflag;
+  return (domain->dimension - nper);   // same for every atom; the index is not used
+}
+
+/* ----------------------------------------------------------------------
+   scalar temperature from the flagged components of m*v^2 (GPU kernel once
+   cuda->begin_setup is set, host loop otherwise), MPI-summed, times tfactor
+------------------------------------------------------------------------- */
+
+double ComputeTempPartialCuda::compute_scalar()
+{
+  if(cuda->begin_setup)
+  {
+    if(not cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);   // NOTE(review): template arguments were lost in this copy; reconstructed from the double host data and the (ENERGY_FLOAT*) cast - confirm
+    if(not cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    invoked_scalar = update->ntimestep;
+    Cuda_ComputeTempPartialCuda_Scalar(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_scalar->dev_data(),xflag,yflag,zflag);
+    cu_t_scalar->download();
+  }
+  else
+  {
+    invoked_scalar = update->ntimestep;
+    double **v = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    int nlocal = atom->nlocal;
+    double t = 0.0;
+    if (rmass) {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit)
+          t += (xflag*v[i][0]*v[i][0] + yflag*v[i][1]*v[i][1] + zflag*v[i][2]*v[i][2]) * rmass[i];
+    } else {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit)
+          t += (xflag*v[i][0]*v[i][0] + yflag*v[i][1]*v[i][1] + zflag*v[i][2]*v[i][2]) *
+            mass[type[i]];
+    }
+    t_scalar=t;
+  }
+
+  MPI_Allreduce(&t_scalar,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
+  if (dynamic) dof_compute();
+  scalar *= tfactor;
+  if(scalar>1e15)
+  {
+    if(cuda->begin_setup) {   // refresh host copies only when the sum came from device data
+      cuda->cu_v->download(); cuda->cu_x->download(); cuda->cu_type->download();
+    }
+    double **v = atom->v;
+    double **x = atom->x;
+    printf("Out of v-range atoms: \n");
+    for(int i=0;i<atom->nlocal;i++)
+      if((v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2])>1e5)
+        printf("%i %i // %lf %lf %lf // %lf %lf %lf\n",atom->tag[i],atom->type[i],x[i][0], x[i][1], x[i][2],v[i][0], v[i][1], v[i][2]);
+    error->all("Temperature out of range. Simulations will be abortet.\n");
+  }
+  return scalar;
+}
+
+/* ----------------------------------------------------------------------
+   six flag-masked m*v_a*v_b sums (xx,yy,zz,xy,xz,yz) over the group, from
+   the GPU kernel or the host loop, MPI-summed and scaled by force->mvv2e
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::compute_vector()
+{
+  int i;
+  if(cuda->begin_setup)
+  {
+    if(not cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);   // NOTE(review): reconstructed template arguments - confirm
+    if(not cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    invoked_vector = update->ntimestep;
+    Cuda_ComputeTempPartialCuda_Vector(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_vector->dev_data(),xflag,yflag,zflag);
+    cu_t_vector->download();
+  }
+  else
+  {
+    invoked_vector = update->ntimestep;
+
+    double **v = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    int nlocal = atom->nlocal;
+
+    double massone,t[6];
+    for (i = 0; i < 6; i++) t[i] = 0.0;
+
+    for (i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        if (rmass) massone = rmass[i];
+        else massone = mass[type[i]];
+        t[0] += massone * xflag*v[i][0]*v[i][0];
+        t[1] += massone * yflag*v[i][1]*v[i][1];
+        t[2] += massone * zflag*v[i][2]*v[i][2];
+        t[3] += massone * xflag*yflag*v[i][0]*v[i][1];
+        t[4] += massone * xflag*zflag*v[i][0]*v[i][2];
+        t[5] += massone * yflag*zflag*v[i][1]*v[i][2];
+      }
+
+    for (i = 0; i < 6; i++) t_vector[i]=t[i];
+  }
+  MPI_Allreduce(t_vector,vector,6,MPI_DOUBLE,MPI_SUM,world);
+  for (i = 0; i < 6; i++) vector[i] *= force->mvv2e;
+}
+
+/* ----------------------------------------------------------------------
+   remove velocity bias from atom I to leave thermal velocity
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::remove_bias(int i, double *v)
+{
+  if (!xflag) {
+    vbias[0] = v[0];
+    v[0] = 0.0;
+  }
+  if (!yflag) {
+    vbias[1] = v[1];
+    v[1] = 0.0;
+  }
+  if (!zflag) {
+    vbias[2] = v[2];
+    v[2] = 0.0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   remove velocity bias from all atoms to leave thermal velocity
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::remove_bias_all()
+{
+  double **v = atom->v;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+
+  if (nlocal > maxbias || cu_vbiasall == NULL) {   // the NULL test covers a first call with nlocal == 0
+    memory->destroy(vbiasall);
+    maxbias = atom->nmax;
+    memory->create(vbiasall,maxbias,3,"temp/partial:vbiasall");
+    delete cu_vbiasall;
+    cu_vbiasall = new cCudaData<double, V_FLOAT, yx> ((double*)vbiasall, atom->nmax, 3);   // NOTE(review): template arguments were lost in this copy; <double, V_FLOAT, yx> is a best guess - confirm against the header
+  }
+  if(cuda->begin_setup)
+  {
+    Cuda_ComputeTempPartialCuda_RemoveBiasAll(&cuda->shared_data,groupbit,xflag,yflag,zflag,cu_vbiasall->dev_data());
+  }
+  else
+  {
+    if (!xflag) {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit) {
+          vbiasall[i][0] = v[i][0];
+          v[i][0] = 0.0;
+        }
+    }
+    if (!yflag) {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit) {
+          vbiasall[i][1] = v[i][1];
+          v[i][1] = 0.0;
+        }
+    }
+    if (!zflag) {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit) {
+          vbiasall[i][2] = v[i][2];
+          v[i][2] = 0.0;
+        }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   add back in velocity bias to atom I removed by remove_bias()
+   assume remove_bias() was previously called
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::restore_bias(int i, double *v)
+{
+  if (!xflag) v[0] += vbias[0];
+  if (!yflag) v[1] += vbias[1];
+  if (!zflag) v[2] += vbias[2];
+}
+
+/* ----------------------------------------------------------------------
+   add back in velocity bias to all atoms removed by remove_bias_all()
+   assume remove_bias_all() was previously called
+------------------------------------------------------------------------- */
+
+void ComputeTempPartialCuda::restore_bias_all()
+{
+  double **v = atom->v;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+  if(cuda->begin_setup)
+  {
+    Cuda_ComputeTempPartialCuda_RestoreBiasAll(&cuda->shared_data,groupbit,xflag,yflag,zflag,cu_vbiasall->dev_data());
+  }
+  else
+  {
+
+    if (!xflag) {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit)
+          v[i][0] += vbiasall[i][0];
+    }
+    if (!yflag) {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit)
+          v[i][1] += vbiasall[i][1];
+    }
+    if (!zflag) {
+      for (int i = 0; i < nlocal; i++)
+        if (mask[i] & groupbit)
+          v[i][2] += vbiasall[i][2];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double ComputeTempPartialCuda::memory_usage()
+{
+  double bytes = 3.0 * maxbias * sizeof(double);   // vbiasall holds maxbias x 3 doubles
+  return bytes;
+}
diff --git a/src/USER-CUDA/compute_temp_partial_cuda.h b/src/USER-CUDA/compute_temp_partial_cuda.h
new file mode 100644
index 0000000000..4412adc88a
--- /dev/null
+++ b/src/USER-CUDA/compute_temp_partial_cuda.h
@@ -0,0 +1,83 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S.
Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(temp/partial/cuda,ComputeTempPartialCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_TEMP_PARTIAL_CUDA_H
+#define LMP_COMPUTE_TEMP_PARTIAL_CUDA_H
+
+#include "compute.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class ComputeTempPartialCuda : public Compute {   // compute style "temp/partial/cuda"
+ public:
+  ComputeTempPartialCuda(class LAMMPS *, int, char **);   // requires lmp->cuda and exactly 6 arguments
+  ~ComputeTempPartialCuda();
+  void init();
+  double compute_scalar();            // temperature from the flagged velocity components
+  void compute_vector();              // 6-entry flag-masked kinetic tensor
+
+  int dof_remove(int);                // dimension minus number of flagged components
+  void remove_bias(int, double *);    // zero the unflagged components of one atom
+  void remove_bias_all();             // same for every atom in the group
+  void restore_bias(int, double *);   // undo remove_bias()
+  void restore_bias_all();            // undo remove_bias_all()
+  double memory_usage();
+
+ private:
+  class Cuda *cuda;                   // copy of lmp->cuda, not owned
+  int xflag,yflag,zflag;              // 0/1 switches read from arg[3..5]
+  int fix_dof;
+  double tfactor;
+
+  void dof_compute();
+  double t_vector[6];                 // per-proc tensor sums before MPI_Allreduce
+  double t_scalar;                    // per-proc scalar sum before MPI_Allreduce
+  cCudaData<double, ENERGY_FLOAT, x>* cu_t_scalar;   // NOTE(review): template arguments were lost in this copy; reconstructed from the .cpp usage - confirm
+  cCudaData<double, ENERGY_FLOAT, x>* cu_t_vector;
+  cCudaData<double, V_FLOAT, yx>* cu_vbiasall;       // NOTE(review): <double, V_FLOAT, yx> is a best guess - confirm
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp
index f5ff1ea72d..66273775e7 100644
--- a/src/USER-CUDA/cuda.cpp
+++ b/src/USER-CUDA/cuda.cpp
@@ -230,7 +230,7 @@ void Cuda::accelerator(int narg, char** arg)
 {
 if(++i==narg)
 error->all("Invalid Options for 'accelerator' command. 
Expecting a string after 'suffix' option."); - strcpy(lmp->asuffix,arg[i]); + strcpy(lmp->suffix,arg[i]); } if(strcmp(arg[i],"overlap_comm")==0) { diff --git a/src/USER-CUDA/cuda_neigh_list.cpp b/src/USER-CUDA/cuda_neigh_list.cpp index e6d7a6f516..5715d8cb8b 100644 --- a/src/USER-CUDA/cuda_neigh_list.cpp +++ b/src/USER-CUDA/cuda_neigh_list.cpp @@ -29,12 +29,16 @@ #include #include "cuda.h" #include "atom.h" +#include "error.h" using namespace LAMMPS_NS; CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp) { cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");) this->neigh_list = neigh_list; neigh_list->cuda_list=this; diff --git a/src/USER-CUDA/domain_cuda.cpp b/src/USER-CUDA/domain_cuda.cpp index fc8d8bb498..438b47b28c 100644 --- a/src/USER-CUDA/domain_cuda.cpp +++ b/src/USER-CUDA/domain_cuda.cpp @@ -54,6 +54,8 @@ enum{NO_REMAP,X_REMAP,V_REMAP}; // same as fix_deform.cpp DomainCuda::DomainCuda(LAMMPS *lmp) : Domain(lmp) { cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); } /* ---------------------------------------------------------------------- */ diff --git a/src/USER-CUDA/fft3d_cuda.cpp b/src/USER-CUDA/fft3d_cuda.cpp new file mode 100644 index 0000000000..bb1278bb75 --- /dev/null +++ b/src/USER-CUDA/fft3d_cuda.cpp @@ -0,0 +1,608 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Jim Shepherd (GA Tech) added SGI SCSL support
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include "fft3d_cuda.h"
+#include "fft3d_cuda_cu.h"
+#include "remap.h"
+#include <ctime>
+#include "cuda_wrapper_cu.h"
+
+#ifdef FFT_CUFFT
+#endif
+#define MIN(A,B) (((A) < (B)) ? (A) : (B))
+#define MAX(A,B) (((A) > (B)) ? (A) : (B))
+
+/* ----------------------------------------------------------------------
+   Data layout for 3d FFTs:
+
+   data set of Nfast x Nmid x Nslow elements is owned by P procs
+   on input, each proc owns a subsection of the elements
+   on output, each proc will own a (possibly different) subsection
+   my subsection must not overlap with any other proc's subsection,
+     i.e. the union of all proc's input (or output) subsections must
+     exactly tile the global Nfast x Nmid x Nslow data set
+   when called from C, all subsection indices are
+     C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
+   when called from F77, all subsection indices are
+     F77-style from 1 to N where N = Nfast or Nmid or Nslow
+   a proc can own 0 elements on input or output
+     by specifying hi index < lo index
+   on both input and output, data is stored contiguously on a processor
+     with a fast-varying, mid-varying, and slow-varying index
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Perform 3d FFT
+
+   Arguments:
+   in           starting address of input data on this proc
+   out          starting address of where output data for this proc
+                  will be placed (can be same as in)
+   flag         1 for forward FFT, -1 for inverse FFT
+                  NOTE(review): the code maps flag == -1 to CUFFT_FORWARD - confirm the convention
+   plan         plan returned by previous call to fft_3d_create_plan
+------------------------------------------------------------------------- */
+
+void fft_3d_cuda(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
+{
+#ifdef FFT_CUFFT
+  plan->iterate++;
+  timespec starttime,starttime2;
+  timespec endtime,endtime2;
+
+  int i,total,length,offset,num;
+  double norm;
+  FFT_DATA *data,*copy;
+  // system specific constants
+
+  // pre-remap to prepare for 1st FFTs if needed
+  // copy = loc for remap result
+  int nprocs=plan->nprocs;
+  if(nprocs>1)
+  {
+    if(plan->init)
+      clock_gettime(CLOCK_REALTIME,&starttime);
+    if (plan->pre_plan) {
+      if (plan->pre_target == 0) copy = out;
+      else copy = plan->copy;
+      if(plan->init) remap_3d((double *) in, (double *) out, (double *) plan->scratch,plan->pre_plan);
+      data = out;
+    }
+    else
+      data = in;
+  }
+  cufftResult retvalc;
+  if(plan->init)
+  {
+    if(nprocs>1)
+    {
+      if(sizeof(FFT_FLOAT)==sizeof(double))cudaMemcpy((void*) (plan->cudata2), (void*) data, plan->cudatasize/2,cudaMemcpyHostToDevice);
+      if(sizeof(FFT_FLOAT)==sizeof(float)) cudaMemcpy((void*) (plan->cudata2), (void*) data, plan->cudatasize,cudaMemcpyHostToDevice);
+      initfftdata((double*)plan->cudata2,(FFT_FLOAT*)plan->cudata,plan->nfast,plan->nmid,plan->nslow);
+    }
+  }
+  if (flag == -1)
+  {
+    retvalc=cufft(plan->plan_3d, plan->cudata, plan->cudata2,CUFFT_FORWARD);
+  }
+  else
+  {
+    retvalc=cufft(plan->plan_3d, plan->cudata, plan->cudata2,CUFFT_INVERSE);
+  }
+  if(retvalc!=CUFFT_SUCCESS) {printf("ErrorCUFFT: %i\n",retvalc);exit(EXIT_FAILURE);}
+
+  FFTsyncthreads();
+#endif
+}
+/* ----------------------------------------------------------------------
+   Create plan for performing a 3d FFT
+
+   Arguments:
+   comm                 MPI communicator for the P procs which own the data
+   nfast,nmid,nslow     size of global 3d matrix
+   in_ilo,in_ihi        input bounds of data I own in fast index
+   in_jlo,in_jhi        input bounds of data I own in mid index
+   in_klo,in_khi        input bounds of data I own in slow index
+   out_ilo,out_ihi      output bounds of data I own in fast index
+   out_jlo,out_jhi      output bounds of data I own in mid index
+   out_klo,out_khi      output bounds of data I own in slow index
+   scaled               0 = no scaling of result, 1 = scaling
+   permute              permutation in storage order of indices on output
+                          0 = no permutation
+                          1 = permute once = mid->fast, slow->mid, fast->slow
+                          2 = permute twice = slow->fast, fast->mid, mid->slow
+   nbuf                 returns size of internal storage buffers used by FFT
+------------------------------------------------------------------------- */
+
+struct fft_plan_3d *fft_3d_create_plan_cuda(
+       MPI_Comm comm, int nfast, int nmid, int nslow,
+       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
+ 
int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int scaled, int permute, int *nbuf,bool ainit) +{ +#ifdef FFT_CUFFT + struct fft_plan_3d *plan; + int me,nprocs; + int i,num,flag,remapflag,fftflag; + int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; + int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; + int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi; + int out_size,first_size,second_size,third_size,copy_size,scratch_size; + int np1,np2,ip1,ip2; + int list[50]; + + // system specific variables + + // query MPI info + + MPI_Comm_rank(comm,&me); + MPI_Comm_size(comm,&nprocs); + +#ifndef FFT_CUFFT + error->all("ERROR: Trying to use cuda fft without FFT_CUFFT set. Recompile with make option 'cufft=1'."); +#endif + // compute division of procs in 2 dimensions not on-processor + bifactor_cuda(nprocs,&np1,&np2); + ip1 = me % np1; + ip2 = me/np1; + + // in case of CUDA FFT every proc does the full FFT in order to avoid data transfers (the problem is other wise heavily bandwidth limited) + + int ip1out = ip1; + int ip2out = ip2; + int np1out = np1; + int np2out = np2; + + ip1 = 0; + ip2 = 0; + np1 = 1; + np2 = 1; + + // allocate memory for plan data struct + + plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d)); + if (plan == NULL) return NULL; + plan->init=ainit; + + // remap from initial distribution to layout needed for 1st set of 1d FFTs + // not needed if all procs own entire fast axis initially + // first indices = distribution after 1st set of FFTs + + if (in_ilo == 0 && in_ihi == nfast-1) + flag = 0; + else + flag = 1; + + if(nprocs>1)flag=1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) { + first_ilo = in_ilo; + first_ihi = in_ihi; + first_jlo = in_jlo; + first_jhi = in_jhi; + first_klo = in_klo; + first_khi = in_khi; + plan->pre_plan = NULL; + } + else { + first_ilo = 0; + first_ihi = nfast - 1; + 
first_jlo = ip1*nmid/np1; + first_jhi = (ip1+1)*nmid/np1 - 1; + first_klo = ip2*nslow/np2; + first_khi = (ip2+1)*nslow/np2 - 1; + int members=2; + if(plan->init) members=1; + plan->pre_plan = + remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + first_ilo,first_ihi,first_jlo,first_jhi, + first_klo,first_khi, + members,0,0,2); + if (plan->pre_plan == NULL) return NULL; + } + + // 1d FFTs along fast axis + + plan->length1 = nfast; + plan->total1 = nfast * nmid * nslow; + + // remap from 1st to 2nd FFT + // choose which axis is split over np1 vs np2 to minimize communication + // second indices = distribution after 2nd set of FFTs + + second_ilo = ip1*nfast/np1; + second_ihi = (ip1+1)*nfast/np1 - 1; + second_jlo = 0; + second_jhi = nmid - 1; + second_klo = ip2*nslow/np2; + second_khi = (ip2+1)*nslow/np2 - 1; + plan->mid1_plan = + remap_3d_create_plan(comm, + first_ilo,first_ihi,first_jlo,first_jhi, + first_klo,first_khi, + second_ilo,second_ihi,second_jlo,second_jhi, + second_klo,second_khi, + 2,1,0,2); + if (plan->mid1_plan == NULL) return NULL; + + // 1d FFTs along mid axis + + plan->length2 = nmid; + plan->total2 = nfast * nmid * nslow; + + // remap from 2nd to 3rd FFT + // if final distribution is permute=2 with all procs owning entire slow axis + // then this remapping goes directly to final distribution + // third indices = distribution after 3rd set of FFTs + + flag=1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) { + third_ilo = out_ilo; + third_ihi = out_ihi; + third_jlo = out_jlo; + third_jhi = out_jhi; + third_klo = out_klo; + third_khi = out_khi; + } + else { + third_ilo = ip1*nfast/np1; + third_ihi = (ip1+1)*nfast/np1 - 1; + third_jlo = ip2*nmid/np2; + third_jhi = (ip2+1)*nmid/np2 - 1; + third_klo = 0; + third_khi = nslow - 1; + } + + plan->mid2_plan = + remap_3d_create_plan(comm, + second_jlo,second_jhi,second_klo,second_khi, + second_ilo,second_ihi, + third_jlo,third_jhi,third_klo,third_khi, + 
third_ilo,third_ihi, + 2,1,0,2); + if (plan->mid2_plan == NULL) return NULL; + + // 1d FFTs along slow axis + + plan->length3 = nslow; + plan->total3 = nfast * nmid * nslow; + + // remap from 3rd FFT to final distribution + // not needed if permute = 2 and third indices = out indices on all procs + + flag=1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) + plan->post_plan = NULL; + else { + plan->post_plan = + remap_3d_create_plan(comm, + third_klo,third_khi,third_ilo,third_ihi, + third_jlo,third_jhi, + out_klo,out_khi,out_ilo,out_ihi, + out_jlo,out_jhi, + 2,(permute+1)%3,0,2); + if (plan->post_plan == NULL) return NULL; + } + + // configure plan memory pointers and allocate work space + // out_size = amount of memory given to FFT by user + // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps + // copy_size = amount needed internally for extra copy of data + // scratch_size = amount needed internally for remap scratch space + // for each remap: + // out space used for result if big enough, else require copy buffer + // accumulate largest required remap scratch space + + out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); + first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) * + (first_khi-first_klo+1); + second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) * + (second_khi-second_klo+1); + third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * + (third_khi-third_klo+1); + + plan->ihi_out=out_ihi; + plan->ilo_out=out_ilo; + plan->jhi_out=out_jhi; + plan->jlo_out=out_jlo; + plan->khi_out=out_khi; + plan->klo_out=out_klo; + + copy_size = 0; + scratch_size = 0; + + if (plan->pre_plan) { + if (first_size <= out_size) + plan->pre_target = 0; + else { + plan->pre_target = 1; + copy_size = MAX(copy_size,first_size); + } + scratch_size = MAX(scratch_size,first_size); + } + + if (plan->mid1_plan) { + if (second_size <= out_size) + plan->mid1_target = 0; 
+ else { + plan->mid1_target = 1; + copy_size = MAX(copy_size,second_size); + } + scratch_size = MAX(scratch_size,second_size); + } + + if (plan->mid2_plan) { + if (third_size <= out_size) + plan->mid2_target = 0; + else { + plan->mid2_target = 1; + copy_size = MAX(copy_size,third_size); + } + scratch_size = MAX(scratch_size,third_size); + } + + if (plan->post_plan) + scratch_size = MAX(scratch_size,out_size); + + *nbuf = copy_size + scratch_size; + + if (copy_size) { + plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA)); + if (plan->copy == NULL) return NULL; + } + else plan->copy = NULL; + + if (scratch_size) { + plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA)); + if (plan->scratch == NULL) return NULL; + } + else plan->scratch = NULL; + + // system specific pre-computation of 1d FFT coeffs + // and scaling normalization + + cufftResult retvalc; + int nfft = (in_ihi-in_ilo+1) * (in_jhi-in_jlo+1) * + (in_khi-in_klo+1); + int nfft_brick = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * + (out_khi-out_klo+1); + + int nfft_both = MAX(nfft,nfft_brick); + nfft_both=nfast*nmid*nslow; + + plan->cudatasize=nfft_both*sizeof(FFT_DATA); + + //retvalc=cufftPlan1d(&(plan->plan_fast), nfast, CUFFT_PLAN,plan->total1/nfast); + //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT1: %i\n",retvalc); + plan->nfast=nfast; + + //retvalc=cufftPlan1d(&(plan->plan_mid), nmid, CUFFT_PLAN,plan->total2/nmid); + //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT2: %i\n",retvalc); + plan->nmid=nmid; + + //retvalc=cufftPlan1d(&(plan->plan_slow), nslow, CUFFT_PLAN,plan->total3/nslow); + //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc); + plan->nslow=nslow; + + retvalc=cufftPlan3d(&(plan->plan_3d), nslow,nmid,nfast, CUFFT_PLAN); + if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc); + + plan->nprocs=nprocs; + plan->me=me; + if (scaled == 0) + plan->scaled = 0; + else { + plan->scaled = 1; + plan->norm = 1.0/(nfast*nmid*nslow); + plan->normnum = 
(out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * + (out_khi-out_klo+1); + } + + plan->coretime=0; + plan->iterate=0; + plan->ffttime=0; + return plan; + #endif +} + +/* ---------------------------------------------------------------------- + Destroy a 3d fft plan +------------------------------------------------------------------------- */ + +void fft_3d_destroy_plan_cuda(struct fft_plan_3d *plan) +{ +#ifdef FFT_CUFFT + if (plan->pre_plan) remap_3d_destroy_plan(plan->pre_plan); + if (plan->mid1_plan) remap_3d_destroy_plan(plan->mid1_plan); + if (plan->mid2_plan) remap_3d_destroy_plan(plan->mid2_plan); + if (plan->post_plan) remap_3d_destroy_plan(plan->post_plan); + + if (plan->copy) free(plan->copy); + if (plan->scratch) free(plan->scratch); + + + //cufftDestroy(plan->plan_fast); + //cufftDestroy(plan->plan_mid); + //cufftDestroy(plan->plan_slow); + cufftDestroy(plan->plan_3d); + free(plan); +#endif +} + +/* ---------------------------------------------------------------------- + recursively divide n into small factors, return them in list +------------------------------------------------------------------------- */ + +void factor_cuda(int n, int *num, int *list) +{ + if (n == 1) { + return; + } + else if (n % 2 == 0) { + *list = 2; + (*num)++; + factor_cuda(n/2,num,list+1); + } + else if (n % 3 == 0) { + *list = 3; + (*num)++; + factor_cuda(n/3,num,list+1); + } + else if (n % 5 == 0) { + *list = 5; + (*num)++; + factor_cuda(n/5,num,list+1); + } + else if (n % 7 == 0) { + *list = 7; + (*num)++; + factor_cuda(n/7,num,list+1); + } + else if (n % 11 == 0) { + *list = 11; + (*num)++; + factor_cuda(n/11,num,list+1); + } + else if (n % 13 == 0) { + *list = 13; + (*num)++; + factor_cuda(n/13,num,list+1); + } + else { + *list = n; + (*num)++; + return; + } +} + +/* ---------------------------------------------------------------------- + divide n into 2 factors of as equal size as possible +------------------------------------------------------------------------- */ + +void 
bifactor_cuda(int n, int *factor1, int *factor2) +{ + int n1,n2,facmax; + + facmax = static_cast (sqrt((double) n)); + + for (n1 = facmax; n1 > 0; n1--) { + n2 = n/n1; + if (n1*n2 == n) { + *factor1 = n1; + *factor2 = n2; + return; + } + } +} + +/* ---------------------------------------------------------------------- + perform just the 1d FFTs needed by a 3d FFT, no data movement + used for timing purposes + + Arguments: + in starting address of input data on this proc, all set to 0.0 + nsize size of in + flag 1 for forward FFT, -1 for inverse FFT + plan plan returned by previous call to fft_3d_create_plan +------------------------------------------------------------------------- */ + +void fft_1d_only_cuda(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan) +{ +#ifdef FFT_CUFFT + int i,total,length,offset,num; + double norm; + + // system specific constants + + + + // total = size of data needed in each dim + // length = length of 1d FFT in each dim + // total/length = # of 1d FFTs in each dim + // if total > nsize, limit # of 1d FFTs to available size of data + + int total1 = plan->total1; + int length1 = plan->length1; + int total2 = plan->total2; + int length2 = plan->length2; + int total3 = plan->total3; + int length3 = plan->length3; + + if (total1 > nsize) total1 = (nsize/length1) * length1; + if (total2 > nsize) total2 = (nsize/length2) * length2; + if (total3 > nsize) total3 = (nsize/length3) * length3; + + // perform 1d FFTs in each of 3 dimensions + // data is just an array of 0.0 + + + cudaMemcpy((void**) &(plan->cudata), (void*) data, plan->cudatasize,cudaMemcpyHostToDevice); + if (flag == -1) { + cufft(plan->plan_3d, plan->cudata, plan->cudata,CUFFT_FORWARD); + /*cufft(plan->plan_fast, plan->cudata, plan->cudata,CUFFT_FORWARD); + cufft(plan->plan_mid, plan->cudata, plan->cudata,CUFFT_FORWARD); + cufft(plan->plan_slow, plan->cudata, plan->cudata,CUFFT_FORWARD);*/ + } else { + cufft(plan->plan_3d, plan->cudata, plan->cudata,CUFFT_FORWARD); + 
/*cufft(plan->plan_fast, plan->cudata, plan->cudata,CUFFT_INVERSE); + cufft(plan->plan_mid,plan->cudata, plan->cudata,CUFFT_INVERSE); + cufft(plan->plan_slow, plan->cudata, plan->cudata,CUFFT_INVERSE);*/ + } + cudaMemcpy((void*) data, (void**) &(plan->cudata), plan->cudatasize,cudaMemcpyDeviceToHost); + + // scaling if required + // limit num to size of data + +#endif +} diff --git a/src/USER-CUDA/fft3d_cuda.h b/src/USER-CUDA/fft3d_cuda.h new file mode 100644 index 0000000000..648d7d6584 --- /dev/null +++ b/src/USER-CUDA/fft3d_cuda.h @@ -0,0 +1,148 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +// User-settable FFT precision + +// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag) +// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag) +#include "cuda_precision.h" +//#define FFT_PRECISION 2 + +// ------------------------------------------------------------------------- + +// Data types for single-precision complex + +#if FFT_PRECISION_CU == 1 + +#ifdef FFT_CUFFT +#include "cuda_runtime.h" +#include "cufft.h" +typedef struct { + float re; + float im; +} FFT_DATA; +typedef cufftComplex cufftData; +typedef cufftReal cufftDataInit; +#define cufft cufftExecC2C +#define cufftinit cufftExecR2C +#define CUFFT_PLAN CUFFT_C2C +#define CUFFT_PLAN_INIT CUFFT_R2C +#else +typedef struct { + float re; + float im; +} FFT_DATA; +#endif + +#endif + +// ------------------------------------------------------------------------- + +// Data types for double-precision complex + +#if FFT_PRECISION_CU == 2 + + +#ifdef FFT_CUFFT +#include "cuda_runtime.h" +#include "cufft.h" +typedef cufftDoubleComplex cufftData; +typedef cufftDoubleReal cufftDataInit; +typedef struct { + double re; + double im; +} FFT_DATA; +#define cufft cufftExecZ2Z +#define cufftinit cufftExecD2Z +#define CUFFT_PLAN CUFFT_Z2Z +#define CUFFT_PLAN_INIT CUFFT_D2Z +#endif + +#endif + +// ------------------------------------------------------------------------- + +// details of how to do a 3d FFT + +struct fft_plan_3d { + struct remap_plan_3d *pre_plan; // remap from input -> 1st FFTs + struct remap_plan_3d *mid1_plan; // remap from 1st -> 2nd FFTs + struct remap_plan_3d *mid2_plan; // remap from 2nd -> 3rd FFTs + struct remap_plan_3d *post_plan; // remap from 3rd FFTs -> output + FFT_DATA *copy; // memory for remap results (if needed) + FFT_DATA *scratch; // scratch space for remaps + int total1,total2,total3; // # of 1st,2nd,3rd FFTs (times length) + int length1,length2,length3; // length of 
1st,2nd,3rd FFTs + int pre_target; // where to put remap results + int mid1_target,mid2_target; + int scaled; // whether to scale FFT results + int normnum; // # of values to rescale + double norm; // normalization factor for rescaling + + double coretime; + double ffttime; + int iterate; + // system specific 1d FFT info + +#ifdef FFT_CUFFT + //CUdeviceptr cudata; + cufftData* cudata; + cufftData* cudata2; + unsigned int cudatasize; + cufftHandle plan_fast; + cufftHandle plan_mid; + cufftHandle plan_slow; + cufftHandle plan_3d; + int nfast; + int nmid; + int nslow; + int ihi_out,ilo_out,jhi_out,jlo_out,khi_out,klo_out; + int me,nprocs; +#endif + int init; +}; + +// function prototypes + +void fft_3d_destroy_plan_cuda(struct fft_plan_3d *); +void factor_cuda(int, int *, int *); +void bifactor_cuda(int, int *, int *); +void fft_1d_only_cuda(FFT_DATA *, int, int, struct fft_plan_3d *); +void fft_3d_cudaA(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *); +void fft_3d_cuda(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *); +struct fft_plan_3d *fft_3d_create_plan_cuda(MPI_Comm, int, int, int, + int, int, int, int, int, int, int, int, int, int, int, int, + int, int, int *,bool init); diff --git a/src/USER-CUDA/fft3d_wrap_cuda.cpp b/src/USER-CUDA/fft3d_wrap_cuda.cpp new file mode 100644 index 0000000000..5fa45bd85c --- /dev/null +++ b/src/USER-CUDA/fft3d_wrap_cuda.cpp @@ -0,0 +1,111 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "fft3d_wrap_cuda.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +FFT3dCuda::FFT3dCuda(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int scaled, int permute, int *nbuf,bool init) : Pointers(lmp) +{ +#ifdef FFT_CUFFT + plan = fft_3d_create_plan_cuda(comm,nfast,nmid,nslow, + in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, + scaled,permute,nbuf,init); +#endif +#ifndef FFT_CUFFT + plan = fft_3d_create_plan(comm,nfast,nmid,nslow, + in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, + scaled,permute,nbuf); +#endif + if (plan == NULL) error->one("Could not create 3d FFT plan"); +} + +/* ---------------------------------------------------------------------- */ + +FFT3dCuda::~FFT3dCuda() +{ +#ifdef FFT_CUFFT + fft_3d_destroy_plan_cuda(plan); +#endif +#ifndef FFT_CUFFT + fft_3d_destroy_plan(plan); +#endif +} + +/* ---------------------------------------------------------------------- */ + +void FFT3dCuda::compute(double *in, double *out, int flag) +{ +#ifdef FFT_CUFFT + fft_3d_cuda((FFT_DATA *) in,(FFT_DATA *) out,flag,plan); +#endif +#ifndef FFT_CUFFT + fft_3d((FFT_DATA *) in,(FFT_DATA *) out,flag,plan); +#endif +} + +/* ---------------------------------------------------------------------- */ + +void FFT3dCuda::timing1d(double *in, int nsize, int flag) +{ +#ifdef FFT_CUFFT + fft_1d_only_cuda((FFT_DATA *) in,nsize,flag,plan); +#endif +#ifndef FFT_CUFFT + fft_1d_only((FFT_DATA *) in,nsize,flag,plan); +#endif +} + +#ifdef FFT_CUFFT +void FFT3dCuda::set_cudata(void* cudata,void* cudata2) +{ + + plan->cudata=(cufftData*) cudata; 
+ plan->cudata2=(cufftData*) cudata2; + +} +#endif diff --git a/src/USER-CUDA/fft3d_wrap_cuda.h b/src/USER-CUDA/fft3d_wrap_cuda.h new file mode 100644 index 0000000000..911057cbec --- /dev/null +++ b/src/USER-CUDA/fft3d_wrap_cuda.h @@ -0,0 +1,68 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef FFT3D_WRAP_CUDA_H_ +#define FFT3D_WRAP_CUDA_H_ + +#include "pointers.h" + +#ifdef FFT_CUFFT + #include "fft3d_cuda.h" +#endif +#ifndef FFT_CUFFT + #include "fft3d.h" +#endif + +namespace LAMMPS_NS { + +class FFT3dCuda : protected Pointers { + public: + FFT3dCuda(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,int,int,int, + int,int,int,int,int,int,int,int,int *,bool); + ~FFT3dCuda(); + void compute(double *, double *, int); + void timing1d(double *, int, int); + +#ifdef FFT_CUFFT + void set_cudata(void* cudata,void* cudata2); +#endif + private: + struct fft_plan_3d *plan; +}; + +} + +#endif /*FFT3D_WRAP_CUDA_H_*/ diff --git a/src/USER-CUDA/fix_addforce_cuda.cpp b/src/USER-CUDA/fix_addforce_cuda.cpp new file mode 100644 index 0000000000..a259068365 --- /dev/null +++ b/src/USER-CUDA/fix_addforce_cuda.cpp @@ -0,0 +1,190 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + + +#include +#include +#include "fix_addforce_cuda.h" +#include "fix_addforce_cuda_cu.h" +#include "atom.h" +#include "update.h" +#include "respa.h" +#include "error.h" +#include "domain.h" +#include "cuda.h" +#include "memory.h" +#include "cuda_modify_flags.h" + + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +FixAddForceCuda::FixAddForceCuda(LAMMPS *lmp, int narg, char **arg) : + Fix(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + if (narg < 6) error->all("Illegal fix addforce/cuda command"); + + scalar_flag = 1; + vector_flag = 1; + size_vector = 3; + global_freq = 1; + extscalar = 1; + extvector = 1; + + xvalue = atof(arg[3]); + yvalue = atof(arg[4]); + zvalue = atof(arg[5]); + + // optional args + + iregion = -1; + + int iarg = 6; + while (iarg < narg) { + if (strcmp(arg[iarg],"region") == 0) { + if (iarg+2 > narg) error->all("Illegal fix addforce/cuda command"); + iregion = domain->find_region(arg[iarg+1]); + if (iregion == -1) error->all("Fix addforce/cuda region ID does not exist"); + iarg += 2; + } else error->all("Illegal fix addforce/cuda command"); + } + + if(iregion!=-1) error->all("Error: fix addforce/cuda does not currently support 'region' option"); + + force_flag = 0; + foriginal[0] = foriginal[1] = foriginal[2] = foriginal[3] = 0.0; + cu_foriginal = NULL; +} + +/* ---------------------------------------------------------------------- */ + +int FixAddForceCuda::setmask() +{ + int mask = 0; + mask |= POST_FORCE_CUDA; + mask |= THERMO_ENERGY_CUDA; + mask |= POST_FORCE_RESPA; + mask |= MIN_POST_FORCE_CUDA; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixAddForceCuda::init() +{ + if(not cu_foriginal) + 
cu_foriginal = new cCudaData (foriginal,4); + if (strcmp(update->integrate_style,"respa") == 0) + nlevels_respa = ((Respa *) update->integrate)->nlevels; +} + +/* ---------------------------------------------------------------------- */ + +void FixAddForceCuda::setup(int vflag) +{ + MYDBG( printf("# CUDA: FixAddForceCuda::setup\n"); ) + + if (strcmp(update->integrate_style,"verlet") == 0) + { + Cuda_FixAddForceCuda_Init(&cuda->shared_data); + cuda->cu_f->upload(); + post_force(vflag); + cuda->cu_f->download(); + + } + else { + ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1); + cuda->cu_f->download(); + post_force_respa(vflag,nlevels_respa-1,0); + cuda->cu_f->upload(); + ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1); + } + MYDBG( printf("# CUDA: FixAddForceCuda::setup done\n"); ) +} + +/* ---------------------------------------------------------------------- */ + +void FixAddForceCuda::min_setup(int vflag) +{ + post_force(vflag); +} + +/* ---------------------------------------------------------------------- */ + +void FixAddForceCuda::post_force(int vflag) +{ + MYDBG( printf("# CUDA: FixAddForceCuda::postforce start\n"); ) + force_flag = 0; + cu_foriginal->memset_device(0); + Cuda_FixAddForceCuda_PostForce(&cuda->shared_data, groupbit, xvalue, yvalue,zvalue,(F_FLOAT*) cu_foriginal->dev_data()); + cu_foriginal->download(); +} + +/* ---------------------------------------------------------------------- */ + +void FixAddForceCuda::post_force_respa(int vflag, int ilevel, int iloop) +{ + if (ilevel == nlevels_respa-1) post_force(vflag); +} + +/* ---------------------------------------------------------------------- */ + +void FixAddForceCuda::min_post_force(int vflag) +{ + post_force(vflag); +} + +/* ---------------------------------------------------------------------- + potential energy of added force +------------------------------------------------------------------------- */ + +double FixAddForceCuda::compute_scalar() +{ + // only sum 
across procs one time + + if (force_flag == 0) { + MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world); + force_flag = 1; + } + return foriginal_all[0]; +} + +/* ---------------------------------------------------------------------- + return components of total force on fix group before force was changed +------------------------------------------------------------------------- */ + +double FixAddForceCuda::compute_vector(int n) +{ + // only sum across procs one time + + if (force_flag == 0) { + MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world); + force_flag = 1; + } + return foriginal_all[n+1]; +} diff --git a/src/USER-CUDA/fix_addforce_cuda.h b/src/USER-CUDA/fix_addforce_cuda.h new file mode 100644 index 0000000000..38efa0528d --- /dev/null +++ b/src/USER-CUDA/fix_addforce_cuda.h @@ -0,0 +1,64 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(addforce/cuda,FixAddForceCuda) + +#else + +#ifndef LMP_FIX_ADD_FORCE_CUDA_H +#define LMP_FIX_ADD_FORCE_CUDA_H + +#include "fix.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class FixAddForceCuda : public Fix { + public: + FixAddForceCuda(class LAMMPS *, int, char **); + int setmask(); + void init(); + void setup(int); + void min_setup(int); + void post_force(int); + void post_force_respa(int, int, int); + void min_post_force(int); + double compute_scalar(); + double compute_vector(int); + + private: + class Cuda *cuda; + int iregion; + double xvalue,yvalue,zvalue; + double foriginal[4],foriginal_all[4]; + cCudaData* cu_foriginal; + int force_flag; + int nlevels_respa; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/fix_aveforce_cuda.cpp b/src/USER-CUDA/fix_aveforce_cuda.cpp new file mode 100644 index 0000000000..8d2b4ddd0f --- /dev/null +++ b/src/USER-CUDA/fix_aveforce_cuda.cpp @@ -0,0 +1,229 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+
+#include "mpi.h"
+#include <string.h>
+#include <stdlib.h>
+#include "fix_aveforce_cuda.h"
+#include "fix_aveforce_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "domain.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+/* constructor: arg[3], arg[4], arg[5] hold the x, y, z value that post_force()
+   adds to the averaged force; the literal NULL switches a component off */
+FixAveForceCuda::FixAveForceCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // the three values are mandatory; further arguments go to the option loop below
+  if (narg < 6) error->all("Illegal fix aveforce command");
+
+  vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extvector = 1;
+
+  xflag = yflag = zflag = 1;
+  xvalue = yvalue = zvalue = 0.0;   // stay defined for NULL components
+  if (strcmp(arg[3],"NULL") == 0) xflag = 0;
+  else xvalue = atof(arg[3]);
+  if (strcmp(arg[4],"NULL") == 0) yflag = 0;
+  else yvalue = atof(arg[4]);
+  if (strcmp(arg[5],"NULL") == 0) zflag = 0;
+  else zvalue = atof(arg[5]);
+
+  // optional args
+
+  iregion = -1;
+
+  int iarg = 6;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"region") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix aveforce command");
+      iregion = domain->find_region(arg[iarg+1]);
+      if (iregion == -1) error->all("Fix aveforce region ID does not exist");
+      iarg += 2;
+    } else error->all("Illegal fix aveforce command");
+  }
+
+  if(iregion!=-1) error->all("Error: fix aveforce/cuda does not currently support 'region' option");
+
+  foriginal_all[0] = foriginal_all[1] = foriginal_all[2] = foriginal_all[3] = 0.0;
+  foriginal[0] = foriginal[1] = foriginal[2] = foriginal[3] = 0.0;
+  cu_foriginal = NULL;
+}
+
+/* return the hook bitmask of this fix: POST_FORCE_CUDA, POST_FORCE_RESPA
+   and MIN_POST_FORCE_CUDA */
+int FixAveForceCuda::setmask()
+{
+  int mask = 0;
+  mask |= POST_FORCE_CUDA;
+  mask |= POST_FORCE_RESPA;
+  mask |= MIN_POST_FORCE_CUDA;
+  return mask;
+}
+
+/* create the device copy of foriginal (once) and, for the respa integrate
+   style, cache the number of rRESPA levels */
+void FixAveForceCuda::init()
+{
+  // allocate the device mirror of foriginal once; the guard keeps a repeated
+  // init() from creating a second buffer
+  // NOTE(review): cCudaData's template arguments were missing in this copy;
+  // <double, F_FLOAT, x> is assumed - confirm against cuda_data.h
+  if(not cu_foriginal)
+    cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,4);
+
+  if (strcmp(update->integrate_style,"respa") == 0)
+    nlevels_respa = ((Respa *) update->integrate)->nlevels;
+}
+
+/* apply the fix once at setup: through post_force() for verlet, otherwise
+   level by level through post_force_respa(); vflag is only passed on */
+void FixAveForceCuda::setup(int vflag)
+{
+  if (strcmp(update->integrate_style,"verlet") == 0)
+  {
+    Cuda_FixAveForceCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+  else
+  {
+    // nlevels_respa is only set by init() when the integrate style is "respa"
+    cuda->cu_f->download();
+    for (int ilevel = 0; ilevel < nlevels_respa; ilevel++) {
+      ((Respa *) update->integrate)->copy_flevel_f(ilevel);
+      post_force_respa(vflag,ilevel,0);
+      ((Respa *) update->integrate)->copy_f_flevel(ilevel);
+    }
+    cuda->cu_f->upload();
+  }
+}
+
+/* min_setup(): forwards to post_force() directly, without the
+   Init/upload/download steps that setup() performs */
+void FixAveForceCuda::min_setup(int vflag)
+{
+  post_force(vflag);
+}
+
+/* sum the group's forces with the FOrg kernel, reduce the sums over all
+   procs and pass average + requested value to the Set kernel */
+void FixAveForceCuda::post_force(int vflag)
+{
+  // sum forces on participating atoms
+
+  cu_foriginal->memset_device(0);
+  Cuda_FixAveForceCuda_PostForce_FOrg(&cuda->shared_data, groupbit,(F_FLOAT*) cu_foriginal->dev_data());
+  cu_foriginal->download();
+
+  // average the force on participating atoms
+  // add in requested amount
+
+  MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  int ncount = static_cast<int> (foriginal_all[3]);
+  if (ncount == 0) return;
+  double fave[3];
+  fave[0] = foriginal_all[0]/ncount + xvalue;
+  fave[1] = foriginal_all[1]/ncount + yvalue;
+  fave[2] = foriginal_all[2]/ncount + zvalue;
+
+  // set force of all participating atoms to same value
+  // only for active dimensions
+
+  Cuda_FixAveForceCuda_PostForce_Set(&cuda->shared_data, groupbit,xflag,yflag,zflag,fave[0],fave[1],fave[2]);
+}
+
+/* rRESPA hook: the outermost level behaves like post_force(); inner levels
+   set the active components to the plain group average using host arrays */
+void FixAveForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // ave + extra force on outermost level
+  // just ave on inner levels
+  if (ilevel == nlevels_respa-1) post_force(vflag);
+  else {
+    cuda->cu_f->download();
+    cuda->cu_mask->download();
+    double **f = atom->f;
+    int *mask = atom->mask;
+    int nlocal = atom->nlocal;
+
+    // local sums (x,y,z force, atom count); named fsum so the member foriginal is not hidden
+    double fsum[4];
+    fsum[0] = fsum[1] = fsum[2] = fsum[3] = 0.0;
+
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        fsum[0] += f[i][0];
+        fsum[1] += f[i][1];
+        fsum[2] += f[i][2];
+        fsum[3] += 1;
+      }
+
+    MPI_Allreduce(fsum,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+    int ncount = static_cast<int> (foriginal_all[3]);
+    if (ncount == 0) return;
+    double fave[3];
+    fave[0] = foriginal_all[0]/ncount;
+    fave[1] = foriginal_all[1]/ncount;
+    fave[2] = foriginal_all[2]/ncount;
+
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        if (xflag) f[i][0] = fave[0];
+        if (yflag) f[i][1] = fave[1];
+        if (zflag) f[i][2] = fave[2];
+      }
+    cuda->cu_f->upload();
+  }
+}
+
+/* min_post_force(): the hook selected by MIN_POST_FORCE_CUDA in setmask();
+   forwards to post_force() */
+void FixAveForceCuda::min_post_force(int vflag)
+{
+  post_force(vflag);
+}
+
+/* ----------------------------------------------------------------------
+   return components of total force on fix group before force was changed
+   (n indexes foriginal_all: 0,1,2 are the summed x,y,z forces)
+------------------------------------------------------------------------- */
+double FixAveForceCuda::compute_vector(int n)
+{
+  return foriginal_all[n];
+}
diff --git a/src/USER-CUDA/fix_aveforce_cuda.h b/src/USER-CUDA/fix_aveforce_cuda.h
new file mode 100644
index 0000000000..987ee9d996
--- /dev/null
+++ b/src/USER-CUDA/fix_aveforce_cuda.h
@@ -0,0 +1,64 @@
+/* 
----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(aveforce/cuda,FixAveForceCuda)
+
+#else
+
+#ifndef LMP_FIX_AVE_FORCE_CUDA_H
+#define LMP_FIX_AVE_FORCE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+// fix style aveforce/cuda: sets the force on the group's atoms to the group average plus a constant
+class FixAveForceCuda : public Fix {
+ public:
+  FixAveForceCuda(class LAMMPS *, int, char **);
+  ~FixAveForceCuda() { delete cu_foriginal; }   // frees the buffer allocated in init(); NULL is fine
+  int setmask();
+  void init();
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);
+  void post_force_respa(int, int, int);
+  void min_post_force(int);
+  double compute_vector(int);
+
+ private:
+  class Cuda *cuda;                // lmp->cuda, checked against NULL in the constructor
+  int xflag,yflag,zflag,iregion;   // per-component switches; region index (-1 = none)
+  double xvalue,yvalue,zvalue;     // value added to the averaged force per component
+  double foriginal_all[4];         // MPI_SUM of foriginal over all procs
+  double foriginal[4];             // this proc's force sums (0-2) and atom count (3)
+  cCudaData<double, F_FLOAT, x>* cu_foriginal;   // NOTE(review): template arguments were missing in this copy; assumed - confirm against cuda_data.h
+  int nlevels_respa;               // set in init() when the integrate style is "respa"
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_enforce2d_cuda.cpp b/src/USER-CUDA/fix_enforce2d_cuda.cpp
new file mode 100644
index 0000000000..b9ac0341ad
--- /dev/null
+++ b/src/USER-CUDA/fix_enforce2d_cuda.cpp
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, 
Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <string.h>
+#include "fix_enforce2d_cuda.h"
+#include "fix_enforce2d_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+/* constructor: takes no style-specific arguments (narg must be 3) and
+   requires lmp->cuda to be set */
+FixEnforce2DCuda::FixEnforce2DCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg != 3) error->all("Illegal fix enforce2d command");
+}
+
+/* return the hook bitmask of this fix: POST_FORCE_CUDA, POST_FORCE_RESPA
+   and MIN_POST_FORCE_CUDA */
+int FixEnforce2DCuda::setmask()
+{
+  int mask = 0;
+  mask |= POST_FORCE_CUDA;
+  mask |= POST_FORCE_RESPA;
+  mask |= MIN_POST_FORCE_CUDA;
+  return mask;
+}
+
+/* reject 3d simulations and warn for each of omega, angmom and torque that
+   is present, since post_force() handles those on the cpu */
+void FixEnforce2DCuda::init()
+{
+  if (domain->dimension == 3)
+    error->all("Cannot use fix enforce2d/cuda with 3d simulation");
+  if (atom->omega_flag)
+    error->warning("Enforce2d/cuda does not support omega_flag on gpu yet. Will be handled on cpu.");
+
+  if (atom->angmom_flag)
+    error->warning("Enforce2d/cuda does not support angmom_flag (angular momentum) on gpu yet. Will be handled on cpu.");
+
+  if (atom->torque_flag)
+    error->warning("Enforce2d/cuda does not support torque_flag on gpu yet. Will be handled on cpu.");
+}
+
+/* apply the fix once at setup: for verlet with cu_f and cu_v uploaded before
+   and downloaded after post_force(); otherwise once per rRESPA level */
+void FixEnforce2DCuda::setup(int vflag)
+{
+  if (strcmp(update->integrate_style,"verlet") == 0)
+  {
+    Cuda_FixEnforce2dCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    cuda->cu_v->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+    cuda->cu_v->download();
+  }
+  else {
+    int nlevels_respa = ((Respa *) update->integrate)->nlevels;
+    for (int ilevel = 0; ilevel < nlevels_respa; ilevel++) {
+      ((Respa *) update->integrate)->copy_flevel_f(ilevel);
+      post_force_respa(vflag,ilevel,0);
+      ((Respa *) update->integrate)->copy_f_flevel(ilevel);
+    }
+  }
+}
+
+/* min_setup(): forwards to post_force() directly, without the
+   Init/upload/download steps that setup() performs */
+void FixEnforce2DCuda::min_setup(int vflag)
+{
+  post_force(vflag);
+}
+
+/* run the CUDA post-force kernel for the group, then zero components 0 and 1
+   of omega, angmom and torque on the host for group atoms, where present */
+void FixEnforce2DCuda::post_force(int vflag)
+{
+  Cuda_FixEnforce2dCuda_PostForce(&cuda->shared_data, groupbit);
+
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
+
+  if (atom->omega_flag) {
+    double **omega = atom->omega;
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        omega[i][0] = 0.0;
+        omega[i][1] = 0.0;
+      }
+  }
+
+  if (atom->angmom_flag) {
+    double **angmom = atom->angmom;
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        angmom[i][0] = 0.0;
+        angmom[i][1] = 0.0;
+      }
+  }
+
+  if (atom->torque_flag) {
+    double **torque = atom->torque;
+    for (int i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        torque[i][0] = 0.0;
+        torque[i][1] = 0.0;
+      }
+  }
+}
+
+/* rRESPA hook: every level is treated alike; ilevel and iloop are unused
+   and the call forwards to post_force() */
+void FixEnforce2DCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  post_force(vflag);
+}
+
+/* min_post_force(): the hook selected by MIN_POST_FORCE_CUDA in setmask();
+   forwards to post_force() */
+void FixEnforce2DCuda::min_post_force(int vflag)
+{
+  post_force(vflag);
+}
diff --git a/src/USER-CUDA/fix_enforce2d_cuda.h b/src/USER-CUDA/fix_enforce2d_cuda.h
new file mode 100644
index 0000000000..2abb1ffa18
--- /dev/null
+++ b/src/USER-CUDA/fix_enforce2d_cuda.h
@@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(enforce2d/cuda,FixEnforce2DCuda)
+
+#else
+
+#ifndef LMP_FIX_ENFORCE2D_CUDA_H
+#define LMP_FIX_ENFORCE2D_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+// fix style enforce2d/cuda; the hooks are implemented in fix_enforce2d_cuda.cpp
+class FixEnforce2DCuda : public Fix {
+ public:
+  FixEnforce2DCuda(class LAMMPS *, int, char **);   // (lmp, narg, arg); narg must be 3
+  int setmask();                          // hook bitmask of this fix
+  void init();                            // 3d check and cpu-fallback warnings
+  void setup(int);                        // one application at setup (verlet or rRESPA)
+  void min_setup(int);                    // forwards to post_force()
+  void post_force(int);                   // CUDA kernel call plus host-side zeroing
+  void post_force_respa(int, int, int);   // forwards to post_force()
+  void min_post_force(int);               // forwards to post_force()
+
+ private:
+  class Cuda *cuda;   // lmp->cuda, checked against NULL in the constructor
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_freeze_cuda.cpp b/src/USER-CUDA/fix_freeze_cuda.cpp
new file mode 100644
index 0000000000..c13dc02cdc
--- /dev/null
+++ b/src/USER-CUDA/fix_freeze_cuda.cpp
@@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <string.h>
+#include "fix_freeze_cuda.h"
+#include "fix_freeze_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "memory.h"
+#include "modify.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+/* constructor: takes no style-specific arguments (narg must be 3) and
+   requires both lmp->cuda and the per-atom torque attribute */
+FixFreezeCuda::FixFreezeCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg != 3) error->all("Illegal fix freeze command");
+
+  if (!atom->torque_flag)
+    error->all("Fix freeze requires atom attribute torque");
+
+  vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extvector = 1;
+
+  // force_flag == 0 tells compute_vector() that foriginal_all must be reduced
+  // again; the device buffer itself is created later, in init()
+  force_flag = 0;
+  foriginal[0] = foriginal[1] = foriginal[2] = 0.0;
+  cu_foriginal=NULL;
+}
+
+/* return the hook bitmask of this fix: POST_FORCE_CUDA and
+   THERMO_ENERGY_CUDA */
+int FixFreezeCuda::setmask()
+{
+  int mask = 0;
+  mask |= POST_FORCE_CUDA;
+  mask |= THERMO_ENERGY_CUDA;
+  return mask;
+}
+
+/* create the device copy of foriginal (once) and stop with an error when
+   more than one freeze fix is defined */
+void FixFreezeCuda::init()
+{
+  // NOTE(review): cCudaData's template arguments were missing in this copy;
+  // <double, F_FLOAT, x> is assumed - confirm against cuda_data.h
+  if(not cu_foriginal)
+    cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,3);
+
+  // count the plain and the /cuda style name alike, so this fix is counted too
+  int count = 0;
+  for (int i = 0; i < modify->nfix; i++) {
+    const char *fstyle = modify->fix[i]->style;
+    if (strcmp(fstyle,"freeze") == 0 || strcmp(fstyle,"freeze/cuda") == 0) count++;
+  }
+  if (count > 1) error->all("More than one fix freeze");
+}
+
+/* for the verlet integrate style apply the fix once: Cuda_FixFreezeCuda_Init,
+   then post_force() between an upload and a download of cu_f */
+void FixFreezeCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixFreezeCuda::setup\n"); )
+
+  if (strcmp(update->integrate_style,"verlet") == 0)
+  {
+    Cuda_FixFreezeCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+
+  MYDBG( printf("# CUDA: FixFreezeCuda::setup done\n"); )
+}
+
+/* zero the device buffer, call the CUDA post-force kernel for the group with
+   that buffer, download it and mark the reduced totals as stale */
+void FixFreezeCuda::post_force(int vflag)
+{
+  MYDBG( printf("# CUDA: FixFreezeCuda::postforce start\n"); )
+  force_flag = 0;
+  cu_foriginal->memset_device(0);
+  Cuda_FixFreezeCuda_PostForce(&cuda->shared_data, groupbit, (F_FLOAT*) cu_foriginal->dev_data());
+  cu_foriginal->download();
+}
+
+/* ----------------------------------------------------------------------
+   return components of total force on fix group before force was changed
+   (n = 0,1,2; the MPI reduction is done once per post_force call)
+------------------------------------------------------------------------- */
+double FixFreezeCuda::compute_vector(int n)
+{
+  // only sum across procs one time
+
+  if (force_flag == 0) {
+    MPI_Allreduce(foriginal,foriginal_all,3,MPI_DOUBLE,MPI_SUM,world);
+    force_flag = 1;
+  }
+  // foriginal_all has exactly 3 entries, matching size_vector = 3
+  return foriginal_all[n];
+}
diff --git a/src/USER-CUDA/fix_freeze_cuda.h b/src/USER-CUDA/fix_freeze_cuda.h
new file mode 100644
index 0000000000..019301096c
--- /dev/null
+++ b/src/USER-CUDA/fix_freeze_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(freeze/cuda,FixFreezeCuda)
+
+#else
+
+#ifndef LMP_FIX_FREEZE_CUDA_H
+#define LMP_FIX_FREEZE_CUDA_H
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+// fix style freeze/cuda; the hooks are implemented in fix_freeze_cuda.cpp
+class FixFreezeCuda : public Fix {
+ public:
+  FixFreezeCuda(class LAMMPS *, int, char **);
+  ~FixFreezeCuda() { delete cu_foriginal; }   // frees the buffer allocated in init(); NULL is fine
+  int setmask();
+  void init();
+  void setup(int);
+  void post_force(int);
+  double compute_vector(int);
+
+ private:
+  class Cuda *cuda;                         // lmp->cuda, checked against NULL in the constructor
+  double foriginal[3],foriginal_all[3];     // per-proc values and their MPI_SUM over all procs
+  cCudaData<double, F_FLOAT, x>* cu_foriginal;   // NOTE(review): template arguments were missing in this copy; assumed - confirm against cuda_data.h
+  int force_flag;                           // 1 once foriginal_all holds the current reduction
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_gravity_cuda.cpp b/src/USER-CUDA/fix_gravity_cuda.cpp
new file mode 100644
index 0000000000..650e9f7ae8
--- /dev/null
+++ b/src/USER-CUDA/fix_gravity_cuda.cpp
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fix_gravity_cuda.h"
+#include "fix_gravity_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+// how the direction of gravity is specified (keyword in arg[4])
+enum{CHUTE,SPHERICAL,GRADIENT,VECTOR};
+
+/* constructor: arg[3] is the magnitude, arg[4] the style keyword, followed by
+   the style's angles (degrees) or direction vector; sets xgrav,ygrav,zgrav */
+FixGravityCuda::FixGravityCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg < 5) error->all("Illegal fix gravity command");
+
+  time_depend = 1;
+
+  magnitude = atof(arg[3]);
+
+  if (strcmp(arg[4],"chute") == 0) {
+    if (narg != 6) error->all("Illegal fix gravity command");
+    style = CHUTE;
+    phi = 0.0;
+    theta = 180.0 - atof(arg[5]);
+  } else if (strcmp(arg[4],"spherical") == 0) {
+    if (narg != 7) error->all("Illegal fix gravity command");
+    style = SPHERICAL;
+    phi = atof(arg[5]);
+    theta = atof(arg[6]);
+  } else if (strcmp(arg[4],"gradient") == 0) {
+    if (narg != 9) error->all("Illegal fix gravity command");
+    style = GRADIENT;
+    phi = atof(arg[5]);
+    theta = atof(arg[6]);
+    phigrad = atof(arg[7]);
+    thetagrad = atof(arg[8]);
+  } else if (strcmp(arg[4],"vector") == 0) {
+    if (narg != 8) error->all("Illegal fix gravity command");
+    style = VECTOR;
+    xdir = atof(arg[5]);
+    ydir = atof(arg[6]);
+    zdir = atof(arg[7]);
+  } else error->all("Illegal fix gravity command");
+
+  double PI = 4.0*atan(1.0);
+  degree2rad = PI/180.0;
+
+  if (style == CHUTE || style == SPHERICAL || style == GRADIENT) {
+    if (domain->dimension == 3) {
+      xgrav = sin(degree2rad * theta) * cos(degree2rad * phi);
+      ygrav = sin(degree2rad * theta) * sin(degree2rad * phi);
+      zgrav = cos(degree2rad * theta);
+    } else {
+      xgrav = sin(degree2rad * theta);
+      ygrav = cos(degree2rad * theta);
+      zgrav = 0.0;
+    }
+  } else if (style == VECTOR) {
+    // normalize the direction; zdir is ignored unless the simulation is 3d
+    const bool is3d = (domain->dimension == 3);
+    double length = sqrt(xdir*xdir + ydir*ydir + (is3d ? zdir*zdir : 0.0));
+    // a zero-length direction cannot be normalized and would give NaN forces
+    if (length == 0.0) error->all("Illegal fix gravity command");
+    xgrav = xdir/length;
+    ygrav = ydir/length;
+    zgrav = is3d ? zdir/length : 0.0;
+  }
+
+  // reference step for the time-dependent gradient style (see post_force)
+  time_origin = update->ntimestep;
+}
+
+/* return the hook bitmask of this fix; only POST_FORCE_CUDA is set, so
+   no rRESPA or minimizer hook is requested */
+int FixGravityCuda::setmask()
+{
+  int mask = 0;
+  mask |= POST_FORCE_CUDA;
+  return mask;
+}
+
+/* cache the timestep size and the acceleration vector
+   (magnitude times the current direction) */
+void FixGravityCuda::init()
+{
+  dt = update->dt;
+
+  xacc = magnitude*xgrav;
+  yacc = magnitude*ygrav;
+  zacc = magnitude*zgrav;
+}
+
+/* for the verlet integrate style apply the fix once: Cuda_FixGravityCuda_Init,
+   then post_force() between an upload and a download of cu_f;
+   nothing is done for other integrate styles */
+void FixGravityCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixGravityCuda::setup\n"); )
+
+  if (strcmp(update->integrate_style,"verlet") == 0)
+  {
+    Cuda_FixGravityCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+
+  MYDBG( printf("# CUDA: FixGravityCuda::setup done\n"); )
+}
+
+/* refresh direction and acceleration for the gradient style, then hand
+   xacc,yacc,zacc and the group bit to the CUDA post-force kernel */
+void FixGravityCuda::post_force(int vflag)
+{
+  // update direction of gravity vector if gradient style
+
+  if (style == GRADIENT) {
+    // the angles advance with (steps since time_origin)*dt; the gradients are
+    // scaled by 360.0 before the conversion with degree2rad
+    if (domain->dimension == 3) {
+      double phi_current = degree2rad *
+        (phi + (update->ntimestep - time_origin)*dt*phigrad*360.0);
+      double theta_current = degree2rad *
+        (theta + (update->ntimestep - time_origin)*dt*thetagrad*360.0);
+      xgrav = sin(theta_current) * cos(phi_current);
+      ygrav = sin(theta_current) * sin(phi_current);
+      zgrav = cos(theta_current);
+    } else {
+      double theta_current = degree2rad *
+        (theta + (update->ntimestep - time_origin)*dt*thetagrad*360.0);
+      xgrav = sin(theta_current);
+      ygrav = cos(theta_current);
+      // zgrav keeps the 0.0 assigned in the constructor for non-3d runs
+    }
+    xacc = magnitude*xgrav;
+    yacc = magnitude*ygrav;
+    zacc = magnitude*zgrav;
+  }
+
+  MYDBG( printf("# CUDA: FixGravityCuda::postforce start\n"); )
+  Cuda_FixGravityCuda_PostForce(&cuda->shared_data, groupbit, xacc,yacc,zacc);
+}
+
+
diff --git a/src/USER-CUDA/fix_gravity_cuda.h b/src/USER-CUDA/fix_gravity_cuda.h
new file mode 100644
index 0000000000..f4aef37790
--- /dev/null
+++ b/src/USER-CUDA/fix_gravity_cuda.h
@@ -0,0 +1,60 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(gravity/cuda,FixGravityCuda)
+
+#else
+
+#ifndef LMP_FIX_GRAVITY_CUDA_H
+#define LMP_FIX_GRAVITY_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+// fix style gravity/cuda; the hooks are implemented in fix_gravity_cuda.cpp
+class FixGravityCuda : public Fix {
+ public:
+  FixGravityCuda(class LAMMPS *, int, char **);   // (lmp, narg, arg)
+  int setmask();           // hook bitmask: POST_FORCE_CUDA
+  void init();             // caches dt and the acceleration vector
+  void setup(int);         // one application at setup for the verlet style
+  void post_force(int);    // updates the gradient direction, calls the CUDA kernel
+
+ private:
+  class Cuda *cuda;                           // lmp->cuda, checked against NULL in the constructor
+  int style;                                  // CHUTE, SPHERICAL, GRADIENT or VECTOR (enum in the .cpp)
+  double magnitude,dt;                        // magnitude from arg[3]; dt copied from update->dt
+  double phi,theta,phigrad,thetagrad;         // angles in degrees and their rates (gradient style)
+  double xdir,ydir,zdir;                      // direction given with the vector style
+  double xgrav,ygrav,zgrav,xacc,yacc,zacc;    // direction and magnitude*direction
+  double degree2rad;                          // PI/180
+  int time_origin;                            // update->ntimestep when the fix was constructed
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_nh_cuda.cpp b/src/USER-CUDA/fix_nh_cuda.cpp
new file mode 100644
index 0000000000..b495850d0c
--- /dev/null
+++ b/src/USER-CUDA/fix_nh_cuda.cpp
@@ -0,0 +1,2077 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mark Stevens (SNL), Aidan Thompson (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "fix_nh_cuda.h" +#include "atom.h" +#include "force.h" +#include "comm.h" +#include "modify.h" +#include "fix_deform.h" +#include "compute.h" +#include "kspace.h" +#include "update.h" +#include "respa.h" +#include "domain.h" +#include "memory.h" +#include "error.h" +#include "math_extra.h" +#include "cuda.h" +#include "fix_nh_cuda_cu.h" +#include "cuda_modify_flags.h" + +using namespace LAMMPS_NS; + +#define MIN(A,B) ((A) < (B)) ? (A) : (B) +#define MAX(A,B) ((A) > (B)) ? (A) : (B) + +enum{NOBIAS,BIAS}; +enum{NONE,XYZ,XY,YZ,XZ}; +enum{ISO,ANISO,TRICLINIC}; + +/* ---------------------------------------------------------------------- + NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion + ---------------------------------------------------------------------- */ + +FixNHCuda::FixNHCuda(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + if (narg < 4) error->all("Illegal fix nvt/npt/nph command"); + + restart_global = 1; + time_integrate = 1; + scalar_flag = 1; + vector_flag = 1; + global_freq = 1; + extscalar = 1; + extvector = 0; + + // default values + + pcouple = NONE; + drag = 0.0; + allremap = 1; + mtchain = mpchain = 3; + nc_tchain = nc_pchain = 1; + mtk_flag = 1; + deviatoric_flag = 0; + nreset_h0 = 0; + + // Used by FixNVTSllod to preserve non-default value + + mtchain_default_flag = 1; + + tstat_flag = 0; + double t_period = 0.0; + + double p_period[6]; + for (int i = 0; i < 6; i++) { + p_start[i] = p_stop[i] = p_period[i] = 0.0; + p_flag[i] = 0; + } + + // process keywords + + dimension = domain->dimension; + + int iarg = 3; + + while (iarg < narg) { + if (strcmp(arg[iarg],"temp") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + tstat_flag = 1; + t_start = atof(arg[iarg+1]); + t_stop = atof(arg[iarg+2]); + t_period = atof(arg[iarg+3]); + if (t_start < 0.0 || t_stop <= 0.0) + error->all("Target T for fix nvt/npt/nph cannot be 0.0"); + iarg += 4; + + } else if (strcmp(arg[iarg],"iso") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + pcouple = XYZ; + p_start[0] = p_start[1] = p_start[2] = atof(arg[iarg+1]); + p_stop[0] = p_stop[1] = p_stop[2] = atof(arg[iarg+2]); + p_period[0] = p_period[1] = p_period[2] = atof(arg[iarg+3]); + p_flag[0] = p_flag[1] = p_flag[2] = 1; + if (dimension == 2) { + p_start[2] = p_stop[2] = p_period[2] = 0.0; + p_flag[2] = 0; + } + iarg += 4; + } else if (strcmp(arg[iarg],"aniso") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + pcouple = NONE; + p_start[0] = p_start[1] = p_start[2] = atof(arg[iarg+1]); + p_stop[0] = p_stop[1] = p_stop[2] = atof(arg[iarg+2]); + p_period[0] = p_period[1] = p_period[2] = atof(arg[iarg+3]); + p_flag[0] = p_flag[1] = p_flag[2] = 1; + if (dimension == 2) { + p_start[2] = p_stop[2] = p_period[2] = 
0.0; + p_flag[2] = 0; + } + iarg += 4; + } else if (strcmp(arg[iarg],"tri") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + pcouple = NONE; + p_start[0] = p_start[1] = p_start[2] = atof(arg[iarg+1]); + p_stop[0] = p_stop[1] = p_stop[2] = atof(arg[iarg+2]); + p_period[0] = p_period[1] = p_period[2] = atof(arg[iarg+3]); + p_flag[0] = p_flag[1] = p_flag[2] = 1; + p_start[3] = p_start[4] = p_start[5] = 0.0; + p_stop[3] = p_stop[4] = p_stop[5] = 0.0; + p_period[3] = p_period[4] = p_period[5] = atof(arg[iarg+3]); + p_flag[3] = p_flag[4] = p_flag[5] = 1; + if (dimension == 2) { + p_start[2] = p_stop[2] = p_period[2] = 0.0; + p_flag[2] = 0; + p_start[3] = p_stop[3] = p_period[3] = 0.0; + p_flag[3] = 0; + p_start[4] = p_stop[4] = p_period[4] = 0.0; + p_flag[4] = 0; + } + iarg += 4; + + } else if (strcmp(arg[iarg],"x") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + p_start[0] = atof(arg[iarg+1]); + p_stop[0] = atof(arg[iarg+2]); + p_period[0] = atof(arg[iarg+3]); + p_flag[0] = 1; + deviatoric_flag = 1; + iarg += 4; + } else if (strcmp(arg[iarg],"y") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + p_start[1] = atof(arg[iarg+1]); + p_stop[1] = atof(arg[iarg+2]); + p_period[1] = atof(arg[iarg+3]); + p_flag[1] = 1; + deviatoric_flag = 1; + iarg += 4; + } else if (strcmp(arg[iarg],"z") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + p_start[2] = atof(arg[iarg+1]); + p_stop[2] = atof(arg[iarg+2]); + p_period[2] = atof(arg[iarg+3]); + p_flag[2] = 1; + deviatoric_flag = 1; + iarg += 4; + if (dimension == 2) + error->all("Invalid fix nvt/npt/nph command for a 2d simulation"); + + } else if (strcmp(arg[iarg],"yz") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + p_start[3] = atof(arg[iarg+1]); + p_stop[3] = atof(arg[iarg+2]); + p_period[3] = atof(arg[iarg+3]); + p_flag[3] = 1; + deviatoric_flag = 1; + iarg += 4; + if (dimension == 2) + 
error->all("Invalid fix nvt/npt/nph command for a 2d simulation"); + } else if (strcmp(arg[iarg],"xz") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + p_start[4] = atof(arg[iarg+1]); + p_stop[4] = atof(arg[iarg+2]); + p_period[4] = atof(arg[iarg+3]); + p_flag[4] = 1; + deviatoric_flag = 1; + iarg += 4; + if (dimension == 2) + error->all("Invalid fix nvt/npt/nph command for a 2d simulation"); + } else if (strcmp(arg[iarg],"xy") == 0) { + if (iarg+4 > narg) error->all("Illegal fix nvt/npt/nph command"); + p_start[5] = atof(arg[iarg+1]); + p_stop[5] = atof(arg[iarg+2]); + p_period[5] = atof(arg[iarg+3]); + p_flag[5] = 1; + deviatoric_flag = 1; + iarg += 4; + + } else if (strcmp(arg[iarg],"couple") == 0) { + if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command"); + if (strcmp(arg[iarg+1],"xyz") == 0) pcouple = XYZ; + else if (strcmp(arg[iarg+1],"xy") == 0) pcouple = XY; + else if (strcmp(arg[iarg+1],"yz") == 0) pcouple = YZ; + else if (strcmp(arg[iarg+1],"xz") == 0) pcouple = XZ; + else if (strcmp(arg[iarg+1],"none") == 0) pcouple = NONE; + else error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + + } else if (strcmp(arg[iarg],"drag") == 0) { + if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command"); + drag = atof(arg[iarg+1]); + if (drag < 0.0) error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + } else if (strcmp(arg[iarg],"dilate") == 0) { + if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command"); + if (strcmp(arg[iarg+1],"all") == 0) allremap = 1; + else if (strcmp(arg[iarg+1],"partial") == 0) allremap = 0; + else error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + } else if (strcmp(arg[iarg],"tchain") == 0) { + if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command"); + mtchain = atoi(arg[iarg+1]); + if (mtchain < 1) error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + } else if (strcmp(arg[iarg],"pchain") == 0) { + if (iarg+2 > narg) error->all("Illegal fix 
nvt/npt/nph command"); + mpchain = atoi(arg[iarg+1]); + if (mpchain < 0) error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + } else if (strcmp(arg[iarg],"mtk") == 0) { + if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command"); + if (strcmp(arg[iarg+1],"yes") == 0) mtk_flag = 1; + else if (strcmp(arg[iarg+1],"no") == 0) mtk_flag = 0; + else error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + } else if (strcmp(arg[iarg],"tloop") == 0) { + if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command"); + nc_tchain = atoi(arg[iarg+1]); + if (nc_tchain < 0) error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + } else if (strcmp(arg[iarg],"ploop") == 0) { + if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command"); + nc_pchain = atoi(arg[iarg+1]); + if (nc_pchain < 0) error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + } else if (strcmp(arg[iarg],"nreset") == 0) { + if (iarg+2 > narg) error->all("Illegal fix nvt/npt/nph command"); + nreset_h0 = atoi(arg[iarg+1]); + if (nreset_h0 < 0) error->all("Illegal fix nvt/npt/nph command"); + iarg += 2; + } else error->all("Illegal fix nvt/npt/nph command"); + } + + // error checks + + if (dimension == 2 && (p_flag[2] || p_flag[3] || p_flag[4])) + error->all("Invalid fix nvt/npt/nph command for a 2d simulation"); + if (dimension == 2 && (pcouple == YZ || pcouple == XZ)) + error->all("Invalid fix nvt/npt/nph command for a 2d simulation"); + + if (pcouple == XYZ && (p_flag[0] == 0 || p_flag[1] == 0)) + error->all("Invalid fix nvt/npt/nph command pressure settings"); + if (pcouple == XYZ && dimension == 3 && p_flag[2] == 0) + error->all("Invalid fix nvt/npt/nph command pressure settings"); + if (pcouple == XY && (p_flag[0] == 0 || p_flag[1] == 0)) + error->all("Invalid fix nvt/npt/nph command pressure settings"); + if (pcouple == YZ && (p_flag[1] == 0 || p_flag[2] == 0)) + error->all("Invalid fix nvt/npt/nph command pressure settings"); + if (pcouple == XZ && (p_flag[0] == 0 || 
p_flag[2] == 0)) + error->all("Invalid fix nvt/npt/nph command pressure settings"); + + if (p_flag[0] && domain->xperiodic == 0) + error->all("Cannot use fix nvt/npt/nph on a non-periodic dimension"); + if (p_flag[1] && domain->yperiodic == 0) + error->all("Cannot use fix nvt/npt/nph on a non-periodic dimension"); + if (p_flag[2] && domain->zperiodic == 0) + error->all("Cannot use fix nvt/npt/nph on a non-periodic dimension"); + if (p_flag[3] && domain->zperiodic == 0) + error->all("Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension"); + if (p_flag[4] && domain->zperiodic == 0) + error->all("Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension"); + if (p_flag[5] && domain->yperiodic == 0) + error->all("Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension"); + + if (!domain->triclinic && (p_flag[3] || p_flag[4] || p_flag[5])) + error->all("Can not specify Pxy/Pxz/Pyz in " + "fix nvt/npt/nph with non-triclinic box"); + + if (pcouple == XYZ && dimension == 3 && + (p_start[0] != p_start[1] || p_start[0] != p_start[2] || + p_stop[0] != p_stop[1] || p_stop[0] != p_stop[2] || + p_period[0] != p_period[1] || p_period[0] != p_period[2])) + error->all("Invalid fix nvt/npt/nph pressure settings"); + if (pcouple == XYZ && dimension == 2 && + (p_start[0] != p_start[1] || p_stop[0] != p_stop[1] || + p_period[0] != p_period[1])) + error->all("Invalid fix nvt/npt/nph pressure settings"); + if (pcouple == XY && + (p_start[0] != p_start[1] || p_stop[0] != p_stop[1] || + p_period[0] != p_period[1])) + error->all("Invalid fix nvt/npt/nph pressure settings"); + if (pcouple == YZ && + (p_start[1] != p_start[2] || p_stop[1] != p_stop[2] || + p_period[1] != p_period[2])) + error->all("Invalid fix nvt/npt/nph pressure settings"); + if (pcouple == XZ && + (p_start[0] != p_start[2] || p_stop[0] != p_stop[2] || + p_period[0] != p_period[2])) + error->all("Invalid fix nvt/npt/nph pressure settings"); + + if ((tstat_flag && t_period <= 0.0) || + (p_flag[0] && p_period[0] <= 
0.0) || + (p_flag[1] && p_period[1] <= 0.0) || + (p_flag[2] && p_period[2] <= 0.0) || + (p_flag[3] && p_period[3] <= 0.0) || + (p_flag[4] && p_period[4] <= 0.0) || + (p_flag[5] && p_period[5] <= 0.0)) + error->all("Fix nvt/npt/nph damping parameters must be > 0.0"); + + // set pstat_flag and box change variables + + pstat_flag = 0; + for (int i = 0; i < 6; i++) + if (p_flag[i]) pstat_flag = 1; + + if (pstat_flag) { + box_change = 1; + if (p_flag[0] || p_flag[1] || p_flag[2]) box_change_size = 1; + if (p_flag[3] || p_flag[4] || p_flag[5]) box_change_shape = 1; + no_change_box = 1; + if (allremap == 0) restart_pbc = 1; + } + + // pstyle = TRICLINIC if any off-diagonal term is controlled -> 6 dof + // else pstyle = ISO if XYZ coupling or XY coupling in 2d -> 1 dof + // else pstyle = ANISO -> 3 dof + + if (p_flag[3] || p_flag[4] || p_flag[5]) pstyle = TRICLINIC; + else if (pcouple == XYZ || (dimension == 2 && pcouple == XY)) pstyle = ISO; + else pstyle = ANISO; + + // convert input periods to frequencies + + t_freq = 0.0; + p_freq[0] = p_freq[1] = p_freq[2] = p_freq[3] = p_freq[4] = p_freq[5] = 0.0; + + if (tstat_flag) t_freq = 1.0 / t_period; + if (p_flag[0]) p_freq[0] = 1.0 / p_period[0]; + if (p_flag[1]) p_freq[1] = 1.0 / p_period[1]; + if (p_flag[2]) p_freq[2] = 1.0 / p_period[2]; + if (p_flag[3]) p_freq[3] = 1.0 / p_period[3]; + if (p_flag[4]) p_freq[4] = 1.0 / p_period[4]; + if (p_flag[5]) p_freq[5] = 1.0 / p_period[5]; + + // Nose/Hoover temp and pressure init + + size_vector = 0; + + if (tstat_flag) { + int ich; + eta = new double[mtchain]; + + // add one extra dummy thermostat, set to zero + + eta_dot = new double[mtchain+1]; + eta_dot[mtchain] = 0.0; + eta_dotdot = new double[mtchain]; + for (ich = 0; ich < mtchain; ich++) { + eta[ich] = eta_dot[ich] = eta_dotdot[ich] = 0.0; + } + eta_mass = new double[mtchain]; + size_vector += 2*2*mtchain; + } + + if (pstat_flag) { + omega[0] = omega[1] = omega[2] = 0.0; + omega_dot[0] = omega_dot[1] = omega_dot[2] = 0.0; + 
omega_mass[0] = omega_mass[1] = omega_mass[2] = 0.0; + omega[3] = omega[4] = omega[5] = 0.0; + omega_dot[3] = omega_dot[4] = omega_dot[5] = 0.0; + omega_mass[3] = omega_mass[4] = omega_mass[5] = 0.0; + if (pstyle == ISO) size_vector += 2*2*1; + else if (pstyle == ANISO) size_vector += 2*2*3; + else if (pstyle == TRICLINIC) size_vector += 2*2*6; + + if (mpchain) { + int ich; + etap = new double[mpchain]; + + // add one extra dummy thermostat, set to zero + + etap_dot = new double[mpchain+1]; + etap_dot[mpchain] = 0.0; + etap_dotdot = new double[mpchain]; + for (ich = 0; ich < mpchain; ich++) { + etap[ich] = etap_dot[ich] = + etap_dotdot[ich] = 0.0; + } + etap_mass = new double[mpchain]; + size_vector += 2*2*mpchain; + } + + if (deviatoric_flag) size_vector += 1; + } + + nrigid = 0; + rfix = NULL; + + // initialize vol0,t0 to zero to signal uninitialized + // values then assigned in init(), if necessary + + vol0 = t0 = 0.0; +} + +/* ---------------------------------------------------------------------- */ + +FixNHCuda::~FixNHCuda() +{ + delete [] rfix; + + // delete temperature and pressure if fix created them + + if (tflag) modify->delete_compute(id_temp); + delete [] id_temp; + + if (tstat_flag) { + delete [] eta; + delete [] eta_dot; + delete [] eta_dotdot; + delete [] eta_mass; + } + + if (pstat_flag) { + if (pflag) modify->delete_compute(id_press); + delete [] id_press; + if (mpchain) { + delete [] etap; + delete [] etap_dot; + delete [] etap_dotdot; + delete [] etap_mass; + } + } +} + +/* ---------------------------------------------------------------------- */ + +int FixNHCuda::setmask() +{ + int mask = 0; + mask |= INITIAL_INTEGRATE_CUDA; + mask |= FINAL_INTEGRATE_CUDA; + mask |= THERMO_ENERGY_CUDA; + //mask |= INITIAL_INTEGRATE_RESPA; + //mask |= FINAL_INTEGRATE_RESPA; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixNHCuda::init() +{ + // insure no conflict with fix deform + + if (pstat_flag) + for 
(int i = 0; i < modify->nfix; i++) + if (strcmp(modify->fix[i]->style,"deform") == 0) { + int *dimflag = ((FixDeform *) modify->fix[i])->dimflag; + if ((p_flag[0] && dimflag[0]) || (p_flag[1] && dimflag[1]) || + (p_flag[2] && dimflag[2]) || (p_flag[3] && dimflag[3]) || + (p_flag[4] && dimflag[4]) || (p_flag[5] && dimflag[5])) + error->all("Cannot use fix npt and fix deform on " + "same component of stress tensor"); + } + + // set temperature and pressure ptrs + + int icompute = modify->find_compute(id_temp); + if (icompute < 0) + error->all("Temperature ID for fix nvt/nph/npt does not exist"); + temperature = modify->compute[icompute]; + + if (temperature->tempbias) which = BIAS; + else which = NOBIAS; + + if (pstat_flag) { + icompute = modify->find_compute(id_press); + if (icompute < 0) error->all("Pressure ID for fix npt/nph does not exist"); + pressure = modify->compute[icompute]; + } + + // set timesteps and frequencies + + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + dthalf = 0.5 * update->dt; + dt4 = 0.25 * update->dt; + dt8 = 0.125 * update->dt; + dto = dthalf; + + p_freq_max = 0.0; + if (pstat_flag) { + p_freq_max = MAX(p_freq[0],p_freq[1]); + p_freq_max = MAX(p_freq_max,p_freq[2]); + if (pstyle == TRICLINIC) { + p_freq_max = MAX(p_freq_max,p_freq[3]); + p_freq_max = MAX(p_freq_max,p_freq[4]); + p_freq_max = MAX(p_freq_max,p_freq[5]); + } + pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain); + } + + if (tstat_flag) + tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain); + + // tally the number of dimensions that are barostatted + // also compute the initial volume and reference cell + // set initial volume and reference cell, if not already done + + if (pstat_flag) { + pdim = p_flag[0] + p_flag[1] + p_flag[2]; + if (vol0 == 0.0) { + if (dimension == 3) vol0 = domain->xprd * domain->yprd * domain->zprd; + else vol0 = domain->xprd * domain->yprd; + h0_inv[0] = domain->h_inv[0]; + h0_inv[1] = domain->h_inv[1]; + 
h0_inv[2] = domain->h_inv[2]; + h0_inv[3] = domain->h_inv[3]; + h0_inv[4] = domain->h_inv[4]; + h0_inv[5] = domain->h_inv[5]; + } + } + + boltz = force->boltz; + nktv2p = force->nktv2p; + + if (force->kspace) kspace_flag = 1; + else kspace_flag = 0; + + if (strcmp(update->integrate_style,"respa") == 0) { + nlevels_respa = ((Respa *) update->integrate)->nlevels; + step_respa = ((Respa *) update->integrate)->step; + dto = 0.5*step_respa[0]; + } + + // detect if any rigid fixes exist so rigid bodies move when box is remapped + // rfix[] = indices to each fix rigid + + delete [] rfix; + nrigid = 0; + rfix = NULL; + + for (int i = 0; i < modify->nfix; i++) + if (modify->fix[i]->rigid_flag) nrigid++; + if (nrigid) { + rfix = new int[nrigid]; + nrigid = 0; + for (int i = 0; i < modify->nfix; i++) + if (modify->fix[i]->rigid_flag) rfix[nrigid++] = i; + } + triggerneighsq= cuda->shared_data.atom.triggerneighsq; + cuda->neighbor_decide_by_integrator=1; + Cuda_FixNHCuda_Init(&cuda->shared_data,dtv,dtf); + +} + +/* ---------------------------------------------------------------------- + compute T,P before integrator starts +------------------------------------------------------------------------- */ + +void FixNHCuda::setup(int vflag) +{ + // initialize some quantities that were not available earlier + + //if (mtk_flag) mtk_factor = 1.0 + 1.0/atom->natoms; + //else mtk_factor = 1.0; + tdof = temperature->dof; + + // t_target is used by compute_scalar(), even for NPH + + if (tstat_flag) t_target = t_start; + else if (pstat_flag) { + + // t0 = initial value for piston mass and energy conservation + // cannot be done in init() b/c temperature cannot be called there + // is b/c Modify::init() inits computes after fixes due to dof dependence + // guesstimate a unit-dependent t0 if actual T = 0.0 + // if it was read in from a restart file, leave it be + + if (t0 == 0.0) { + t0 = temperature->compute_scalar(); + if (t0 == 0.0) { + if (strcmp(update->unit_style,"lj") == 0) t0 = 1.0; + 
else t0 = 300.0; + } + } + t_target = t0; + } + + if (pstat_flag) compute_press_target(); + + t_current = temperature->compute_scalar(); + if (pstat_flag) { + if (pstyle == ISO) double tmp = pressure->compute_scalar(); + else pressure->compute_vector(); + couple(); + pressure->addstep(update->ntimestep+1); + } + + // initial forces on thermostat variables + + if (tstat_flag) { + eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq); + for (int ich = 1; ich < mtchain; ich++) + eta_mass[ich] = boltz * t_target / (t_freq*t_freq); + for (int ich = 1; ich < mtchain; ich++) { + eta_dotdot[ich] = (eta_mass[ich-1]*eta_dot[ich-1]*eta_dot[ich-1] - + boltz*t_target) / eta_mass[ich]; + } + } + + if (pstat_flag) { + double kt = boltz * t_target; + double nkt = atom->natoms * kt; + + for (int i = 0; i < 3; i++) + if (p_flag[i]) + omega_mass[i] = nkt/(p_freq[i]*p_freq[i]); + + if (pstyle == TRICLINIC) { + for (int i = 3; i < 6; i++) + if (p_flag[i]) omega_mass[i] = nkt/(p_freq[i]*p_freq[i]); + } + + // initial forces on barostat thermostat variables + + if (mpchain) { + etap_mass[0] = boltz * t_target / (p_freq_max*p_freq_max); + for (int ich = 1; ich < mpchain; ich++) + etap_mass[ich] = boltz * t_target / (p_freq_max*p_freq_max); + for (int ich = 1; ich < mpchain; ich++) + etap_dotdot[ich] = + (etap_mass[ich-1]*etap_dot[ich-1]*etap_dot[ich-1] - + boltz*t_target) / etap_mass[ich]; + } + + // compute appropriately coupled elements of mvv_current + + //if (mtk_flag) couple_ke(); + } +} + +/* ---------------------------------------------------------------------- + 1st half of Verlet update +------------------------------------------------------------------------- */ + +void FixNHCuda::initial_integrate(int vflag) +{ + if(!temperature->cudable) cuda->downloadAll(); + + if(triggerneighsq!=cuda->shared_data.atom.triggerneighsq) + { + triggerneighsq= cuda->shared_data.atom.triggerneighsq; + Cuda_FixNHCuda_Init(&cuda->shared_data,dtv,dtf); + } + + // update eta_press_dot + + if 
(pstat_flag && mpchain) nhc_press_integrate(); + + // update eta_dot + + if (tstat_flag) { + double delta = update->ntimestep - update->beginstep; + delta /= update->endstep - update->beginstep; + t_target = t_start + delta * (t_stop-t_start); + eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq); + for (int ich = 1; ich < mtchain; ich++) + eta_mass[ich] = boltz * t_target / (t_freq*t_freq); + nhc_temp_integrate(); + } + + // need to recompute pressure to account for change in KE + // t_current is up-to-date, but compute_temperature is not + // compute appropriately coupled elements of mvv_current + + if (pstat_flag) { + if (pstyle == ISO) { + temperature->compute_scalar(); + double tmp = pressure->compute_scalar(); + } else { + temperature->compute_vector(); + pressure->compute_vector(); + } + couple(); + pressure->addstep(update->ntimestep+1); + //if (mtk_flag) couple_ke(); + } + + if(which==NOBIAS) + { + if (pstat_flag) { + compute_press_target(); + nh_omega_dot(); + factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2)); + factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2)); + factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2)); + Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0); + } + else + Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal); + } + else if(which==BIAS) + { + if(pstat_flag) + { + compute_press_target(); + nh_omega_dot(); + factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2)); + factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2)); + factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2)); + if(!temperature->cudable) + { + nh_v_press(); + cuda->cu_v->upload(); + } + else + { + int groupbit_org=temperature->groupbit; + temperature->groupbit=groupbit; + temperature->remove_bias_all(); + Cuda_FixNHCuda_nh_v_press(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == 
TRICLINIC)?1:0); + temperature->restore_bias_all(); + temperature->groupbit=groupbit_org; + } + } + Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal); + } + + // remap simulation box by 1/2 step + + if (pstat_flag) remap(); + + Cuda_FixNHCuda_nve_x(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal); + + // remap simulation box by 1/2 step + // redo KSpace coeffs since volume has changed + + if (pstat_flag) { + remap(); + if (kspace_flag) force->kspace->setup(); + } +} + +/* ---------------------------------------------------------------------- + 2nd half of Verlet update +------------------------------------------------------------------------- */ + +void FixNHCuda::final_integrate() +{ + if(!temperature->cudable) cuda->downloadAll(); + + if(which==NOBIAS) + { + if(pstat_flag) + { + factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2)); + factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2)); + factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2)); + + Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0); + } + else + Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal); + } + else if(which==BIAS) + { + Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal); + + if(pstat_flag) + { + factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2)); + factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2)); + factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2)); + if(!temperature->cudable) + { + cuda->cu_v->download(); + nh_v_press(); + cuda->cu_v->upload(); + } + else + { + int groupbit_org=temperature->groupbit; + temperature->groupbit=groupbit; + temperature->remove_bias_all(); + Cuda_FixNHCuda_nh_v_press(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0); 
+ temperature->restore_bias_all(); + temperature->groupbit=groupbit_org; + } + } + } + // compute new T,P + // compute appropriately coupled elements of mvv_current + + if(!temperature->cudable) cuda->cu_v->download(); + t_current = temperature->compute_scalar(); + if (pstat_flag) { + if (pstyle == ISO) double tmp = pressure->compute_scalar(); + else pressure->compute_vector(); + couple(); + pressure->addstep(update->ntimestep+1); + } + + if (pstat_flag) nh_omega_dot(); + + // update eta_dot + // update eta_press_dot + + if (tstat_flag) nhc_temp_integrate(); + if (pstat_flag && mpchain) nhc_press_integrate(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNHCuda::initial_integrate_respa(int vflag, int ilevel, int iloop) +{ + int i; + + // set timesteps by level + + dtv = step_respa[ilevel]; + dtf = 0.5 * step_respa[ilevel] * force->ftm2v; + dthalf = 0.5 * step_respa[ilevel]; + + // outermost level - update eta_dot and omega_dot, apply to v, remap box + // all other levels - NVE update of v + // x,v updates only performed for atoms in group + + if (ilevel == nlevels_respa-1) { + + // update eta_press_dot + + if (pstat_flag && mpchain) nhc_press_integrate(); + + // update eta_dot + + if (tstat_flag) { + double delta = update->ntimestep - update->beginstep; + delta /= update->endstep - update->beginstep; + t_target = t_start + delta * (t_stop-t_start); + eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq); + for (int ich = 1; ich < mtchain; ich++) + eta_mass[ich] = boltz * t_target / (t_freq*t_freq); + nhc_temp_integrate(); + } + + // recompute pressure to account for change in KE + // t_current is up-to-date, but compute_temperature is not + // compute appropriately coupled elements of mvv_current + + if (pstat_flag) { + if (pstyle == ISO) { + temperature->compute_scalar(); + double tmp = pressure->compute_scalar(); + } else { + temperature->compute_vector(); + pressure->compute_vector(); + } + couple(); + 
pressure->addstep(update->ntimestep+1); + if (mtk_flag) couple_ke(); + } + + if (pstat_flag) { + compute_press_target(); + nh_omega_dot(); + nh_v_press(); + } + + nve_v(); + + } else nve_v(); + + // innermost level - also update x only for atoms in group + // if barostat, perform 1/2 step remap before and after + + if (ilevel == 0) { + if (pstat_flag) remap(); + nve_x(); + if (pstat_flag) remap(); + } + + // if barostat, redo KSpace coeffs at outermost level, + // since volume has changed + + if (ilevel == nlevels_respa-1 && kspace_flag && pstat_flag) + force->kspace->setup(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNHCuda::final_integrate_respa(int ilevel, int iloop) +{ + // set timesteps by level + + dtf = 0.5 * step_respa[ilevel] * force->ftm2v; + dthalf = 0.5 * step_respa[ilevel]; + + // outermost level - update eta_dot and omega_dot, apply via final_integrate + // all other levels - NVE update of v + + if (ilevel == nlevels_respa-1) final_integrate(); + else nve_v(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNHCuda::couple() +{ + double *tensor = pressure->vector; + + if (pstyle == ISO) + p_current[0] = p_current[1] = p_current[2] = pressure->scalar; + else if (pcouple == XYZ) { + double ave = 1.0/3.0 * (tensor[0] + tensor[1] + tensor[2]); + p_current[0] = p_current[1] = p_current[2] = ave; + } else if (pcouple == XY) { + double ave = 0.5 * (tensor[0] + tensor[1]); + p_current[0] = p_current[1] = ave; + p_current[2] = tensor[2]; + } else if (pcouple == YZ) { + double ave = 0.5 * (tensor[1] + tensor[2]); + p_current[1] = p_current[2] = ave; + p_current[0] = tensor[0]; + } else if (pcouple == XZ) { + double ave = 0.5 * (tensor[0] + tensor[2]); + p_current[0] = p_current[2] = ave; + p_current[1] = tensor[1]; + } else { + p_current[0] = tensor[0]; + p_current[1] = tensor[1]; + p_current[2] = tensor[2]; + } + + // switch order from xy-xz-yz to Voigt + + if 
(pstyle == TRICLINIC) { + p_current[3] = tensor[5]; + p_current[4] = tensor[4]; + p_current[5] = tensor[3]; + } +} + +/* ---------------------------------------------------------------------- */ + +void FixNHCuda::couple_ke() +{ + double *tensor = temperature->vector; + if (pstyle == ISO) + mvv_current[0] = mvv_current[1] = mvv_current[2] = + tdof * boltz * t_current/dimension; + else if (pcouple == XYZ) { + double ave = 1.0/3.0 * (tensor[0] + tensor[1] + tensor[2]); + mvv_current[0] = mvv_current[1] = mvv_current[2] = ave; + } else if (pcouple == XY) { + double ave = 0.5 * (tensor[0] + tensor[1]); + mvv_current[0] = mvv_current[1] = ave; + mvv_current[2] = tensor[2]; + } else if (pcouple == YZ) { + double ave = 0.5 * (tensor[1] + tensor[2]); + mvv_current[1] = mvv_current[2] = ave; + mvv_current[0] = tensor[0]; + } else if (pcouple == XZ) { + double ave = 0.5 * (tensor[0] + tensor[2]); + mvv_current[0] = mvv_current[2] = ave; + mvv_current[1] = tensor[1]; + } else { + mvv_current[0] = tensor[0]; + mvv_current[1] = tensor[1]; + mvv_current[2] = tensor[2]; + } +} + +/* ---------------------------------------------------------------------- + change box size + remap all atoms or fix group atoms depending on allremap flag + if rigid bodies exist, scale rigid body centers-of-mass +------------------------------------------------------------------------- */ + +void FixNHCuda::remap() +{ + int i; + double oldlo,oldhi,ctr; + + double **x = atom->x; + int *mask = atom->mask; + int nlocal = atom->nlocal; + double *h = domain->h; + + // omega is not used, except for book-keeping + + for (int i = 0; i < 6; i++) omega[i] += dto*omega_dot[i]; + + // convert pertinent atoms and rigid bodies to lamda coords + if (allremap) domain->x2lamda(nlocal); + else { + for (i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + domain->x2lamda(x[i],x[i]); + } + + if (nrigid) + for (i = 0; i < nrigid; i++) + modify->fix[rfix[i]]->deform(0); + + // reset global and local box to new size/shape + + 
// This operation corresponds to applying the + // translate and scale operations + // corresponding to the solution of the following ODE: + // + // h_dot = omega_dot * h + // + // where h_dot, omega_dot and h are all upper-triangular + // 3x3 tensors. In Voigt notation, the elements of the + // RHS product tensor are: + // h_dot = [0*0, 1*1, 2*2, 1*3+3*2, 0*4+5*3+4*2, 0*5+5*1] + // + // Ordering of operations preserves time symmetry. + + double dto2 = dto/2.0; + double dto4 = dto/4.0; + double dto8 = dto/8.0; + + if (pstyle == TRICLINIC) { + + h[4] *= exp(dto8*omega_dot[0]); + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= exp(dto8*omega_dot[0]); + + h[3] *= exp(dto4*omega_dot[1]); + h[3] += dto2*(omega_dot[3]*h[2]); + h[3] *= exp(dto4*omega_dot[1]); + + h[5] *= exp(dto4*omega_dot[0]); + h[5] += dto2*(omega_dot[5]*h[1]); + h[5] *= exp(dto4*omega_dot[0]); + + h[4] *= exp(dto8*omega_dot[0]); + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= exp(dto8*omega_dot[0]); + + } + + for (i = 0; i < 3; i++) { + if (p_flag[i]) { + oldlo = domain->boxlo[i]; + oldhi = domain->boxhi[i]; + ctr = 0.5 * (oldlo + oldhi); + domain->boxlo[i] = (oldlo-ctr)*exp(dto*omega_dot[i]) + ctr; + domain->boxhi[i] = (oldhi-ctr)*exp(dto*omega_dot[i]) + ctr; + } + } + + if (pstyle == TRICLINIC) { + + h[4] *= exp(dto8*omega_dot[0]); + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= exp(dto8*omega_dot[0]); + + h[3] *= exp(dto4*omega_dot[1]); + h[3] += dto2*(omega_dot[3]*h[2]); + h[3] *= exp(dto4*omega_dot[1]); + + h[5] *= exp(dto4*omega_dot[0]); + h[5] += dto2*(omega_dot[5]*h[1]); + h[5] *= exp(dto4*omega_dot[0]); + + h[4] *= exp(dto8*omega_dot[0]); + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= exp(dto8*omega_dot[0]); + + domain->yz = h[3]; + domain->xz = h[4]; + domain->xy = h[5]; + + if (domain->yz < -0.5*domain->yprd || domain->yz > 0.5*domain->yprd || + domain->xz < -0.5*domain->xprd || domain->xz > 0.5*domain->xprd || + domain->xy < 
-0.5*domain->xprd || domain->xy > 0.5*domain->xprd) + error->all("Fix npt/nph has tilted box too far - " + "box flips are not yet implemented"); + } + + domain->set_global_box(); + domain->set_local_box(); + + // convert pertinent atoms and rigid bodies back to box coords + + if (allremap) domain->lamda2x(nlocal); + else { + for (i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + domain->lamda2x(x[i],x[i]); + } + + if (nrigid) + for (i = 0; i < nrigid; i++) + modify->fix[rfix[i]]->deform(1); +} + +/* ---------------------------------------------------------------------- + pack entire state of Fix into one write +------------------------------------------------------------------------- */ + +void FixNHCuda::write_restart(FILE *fp) +{ + int nsize = 2; + if (tstat_flag) nsize += 1 + 2*mtchain; + if (pstat_flag) { + nsize += 16 + 2*mpchain; + if (deviatoric_flag) nsize += 6; + } + + double* list = (double *) memory->smalloc(nsize*sizeof(double),"nh:list"); + + int n = 0; + + list[n++] = tstat_flag; + if (tstat_flag) { + list[n++] = mtchain; + for (int ich = 0; ich < mtchain; ich++) + list[n++] = eta[ich]; + for (int ich = 0; ich < mtchain; ich++) + list[n++] = eta_dot[ich]; + } + + list[n++] = pstat_flag; + if (pstat_flag) { + list[n++] = omega[0]; + list[n++] = omega[1]; + list[n++] = omega[2]; + list[n++] = omega[3]; + list[n++] = omega[4]; + list[n++] = omega[5]; + list[n++] = omega_dot[0]; + list[n++] = omega_dot[1]; + list[n++] = omega_dot[2]; + list[n++] = omega_dot[3]; + list[n++] = omega_dot[4]; + list[n++] = omega_dot[5]; + list[n++] = vol0; + list[n++] = t0; + list[n++] = mpchain; + if (mpchain) { + for (int ich = 0; ich < mpchain; ich++) + list[n++] = etap[ich]; + for (int ich = 0; ich < mpchain; ich++) + list[n++] = etap_dot[ich]; + } + + list[n++] = deviatoric_flag; + if (deviatoric_flag) { + list[n++] = h0_inv[0]; + list[n++] = h0_inv[1]; + list[n++] = h0_inv[2]; + list[n++] = h0_inv[3]; + list[n++] = h0_inv[4]; + list[n++] = h0_inv[5]; + } + } + + if 
(comm->me == 0) { + int size = nsize * sizeof(double); + fwrite(&size,sizeof(int),1,fp); + fwrite(list,sizeof(double),nsize,fp); + } + + memory->sfree(list); +} + +/* ---------------------------------------------------------------------- + use state info from restart file to restart the Fix +------------------------------------------------------------------------- */ + +void FixNHCuda::restart(char *buf) +{ + int n = 0; + double *list = (double *) buf; + int flag = static_cast (list[n++]); + if (flag) { + int m = static_cast (list[n++]); + if (tstat_flag && m == mtchain) { + for (int ich = 0; ich < mtchain; ich++) + eta[ich] = list[n++]; + for (int ich = 0; ich < mtchain; ich++) + eta_dot[ich] = list[n++]; + } else n += 2*m; + } + flag = static_cast (list[n++]); + if (flag) { + omega[0] = list[n++]; + omega[1] = list[n++]; + omega[2] = list[n++]; + omega[3] = list[n++]; + omega[4] = list[n++]; + omega[5] = list[n++]; + omega_dot[0] = list[n++]; + omega_dot[1] = list[n++]; + omega_dot[2] = list[n++]; + omega_dot[3] = list[n++]; + omega_dot[4] = list[n++]; + omega_dot[5] = list[n++]; + vol0 = list[n++]; + t0 = list[n++]; + int m = static_cast (list[n++]); + if (pstat_flag && m == mpchain) { + for (int ich = 0; ich < mpchain; ich++) + etap[ich] = list[n++]; + for (int ich = 0; ich < mpchain; ich++) + etap_dot[ich] = list[n++]; + } else n+=2*m; + flag = static_cast (list[n++]); + if (flag) { + h0_inv[0] = list[n++]; + h0_inv[1] = list[n++]; + h0_inv[2] = list[n++]; + h0_inv[3] = list[n++]; + h0_inv[4] = list[n++]; + h0_inv[5] = list[n++]; + } + } +} + +/* ---------------------------------------------------------------------- */ + +int FixNHCuda::modify_param(int narg, char **arg) +{ + if (strcmp(arg[0],"temp") == 0) { + if (narg < 2) error->all("Illegal fix_modify command"); + if (tflag) { + modify->delete_compute(id_temp); + tflag = 0; + } + delete [] id_temp; + int n = strlen(arg[1]) + 1; + id_temp = new char[n]; + strcpy(id_temp,arg[1]); + + int icompute = 
modify->find_compute(arg[1]); + if (icompute < 0) error->all("Could not find fix_modify temperature ID"); + temperature = modify->compute[icompute]; + + if (temperature->tempflag == 0) + error->all("Fix_modify temperature ID does not compute temperature"); + if (temperature->igroup != 0 && comm->me == 0) + error->warning("Temperature for fix modify is not for group all"); + + // reset id_temp of pressure to new temperature ID + + if (pstat_flag) { + icompute = modify->find_compute(id_press); + if (icompute < 0) + error->all("Pressure ID for fix modify does not exist"); + modify->compute[icompute]->reset_extra_compute_fix(id_temp); + } + + return 2; + + } else if (strcmp(arg[0],"press") == 0) { + if (narg < 2) error->all("Illegal fix_modify command"); + if (!pstat_flag) error->all("Illegal fix_modify command"); + if (pflag) { + modify->delete_compute(id_press); + pflag = 0; + } + delete [] id_press; + int n = strlen(arg[1]) + 1; + id_press = new char[n]; + strcpy(id_press,arg[1]); + + int icompute = modify->find_compute(arg[1]); + if (icompute < 0) error->all("Could not find fix_modify pressure ID"); + pressure = modify->compute[icompute]; + + if (pressure->pressflag == 0) + error->all("Fix_modify pressure ID does not compute pressure"); + return 2; + } + + return 0; +} + +/* ---------------------------------------------------------------------- */ + +double FixNHCuda::compute_scalar() +{ + int i; + double volume; + double energy; + double kt = boltz * t_target; + double lkt = tdof * kt; + double lkt_press = kt; + int ich; + if (dimension == 3) volume = domain->xprd * domain->yprd * domain->zprd; + else volume = domain->xprd * domain->yprd; + + energy = 0.0; + + // thermostat chain energy is equivalent to Eq. 
(2) in + // Martyna, Tuckerman, Tobias, Klein, Mol Phys, 87, 1117 + // Sum(0.5*p_eta_k^2/Q_k,k=1,M) + L*k*T*eta_1 + Sum(k*T*eta_k,k=2,M), + // where L = tdof + // M = mtchain + // p_eta_k = Q_k*eta_dot[k-1] + // Q_1 = L*k*T/t_freq^2 + // Q_k = k*T/t_freq^2, k > 1 + + if (tstat_flag) { + energy += lkt * eta[0] + 0.5*eta_mass[0]*eta_dot[0]*eta_dot[0]; + for (ich = 1; ich < mtchain; ich++) + energy += kt * eta[ich] + 0.5*eta_mass[ich]*eta_dot[ich]*eta_dot[ich]; + } + + // barostat energy is equivalent to Eq. (8) in + // Martyna, Tuckerman, Tobias, Klein, Mol Phys, 87, 1117 + // Sum(0.5*p_omega^2/W + P*V), + // where N = natoms + // p_omega = W*omega_dot + // W = N*k*T/p_freq^2 + // sum is over barostatted dimensions + + if (pstat_flag) { + for (i = 0; i < 3; i++) + if (p_flag[i]) + energy += 0.5*omega_dot[i]*omega_dot[i]*omega_mass[i] + + p_hydro*(volume-vol0) / (pdim*nktv2p); + + if (pstyle == TRICLINIC) { + for (i = 3; i < 6; i++) + if (p_flag[i]) + energy += 0.5*omega_dot[i]*omega_dot[i]*omega_mass[i]; + } + + // extra contributions from thermostat chain for barostat + + if (mpchain) { + energy += lkt_press * etap[0] + 0.5*etap_mass[0]*etap_dot[0]*etap_dot[0]; + for (ich = 1; ich < mpchain; ich++) + energy += kt * etap[ich] + + 0.5*etap_mass[ich]*etap_dot[ich]*etap_dot[ich]; + } + + // extra contribution from strain energy + + if (deviatoric_flag) energy += compute_strain_energy(); + } + + return energy; +} + +/* ---------------------------------------------------------------------- + return a single element of the following vectors, in this order: + eta[tchain], eta_dot[tchain], omega[ndof], omega_dot[ndof] + etap[pchain], etap_dot[pchain], PE_eta[tchain], KE_eta_dot[tchain] + PE_omega[ndof], KE_omega_dot[ndof], PE_etap[pchain], KE_etap_dot[pchain] + PE_strain[1] + if no thermostat exists, related quantities are omitted from the list + if no barostat exists, related quantities are omitted from the list + ndof = 1,3,6 degrees of freedom for pstyle = ISO,ANISO,TRI 
+------------------------------------------------------------------------- */ + +double FixNHCuda::compute_vector(int n) +{ + int ilen; + + if (tstat_flag) { + ilen = mtchain; + if (n < ilen) return eta[n]; + n -= ilen; + ilen = mtchain; + if (n < ilen) return eta_dot[n]; + n -= ilen; + } + + if (pstat_flag) { + if (pstyle == ISO) { + ilen = 1; + if (n < ilen) return omega[n]; + n -= ilen; + } else if (pstyle == ANISO) { + ilen = 3; + if (n < ilen) return omega[n]; + n -= ilen; + } else { + ilen = 6; + if (n < ilen) return omega[n]; + n -= ilen; + } + + if (pstyle == ISO) { + ilen = 1; + if (n < ilen) return omega_dot[n]; + n -= ilen; + } else if (pstyle == ANISO) { + ilen = 3; + if (n < ilen) return omega_dot[n]; + n -= ilen; + } else { + ilen = 6; + if (n < ilen) return omega_dot[n]; + n -= ilen; + } + + if (mpchain) { + ilen = mpchain; + if (n < ilen) return etap[n]; + n -= ilen; + ilen = mpchain; + if (n < ilen) return etap_dot[n]; + n -= ilen; + } + } + + int i; + double volume; + double kt = boltz * t_target; + double lkt = tdof * kt; + double lkt_press = kt; + int ich; + if (dimension == 3) volume = domain->xprd * domain->yprd * domain->zprd; + else volume = domain->xprd * domain->yprd; + + if (tstat_flag) { + ilen = mtchain; + if (n < ilen) { + ich = n; + if (ich == 0) + return lkt * eta[0]; + else + return kt * eta[ich]; + } + n -= ilen; + ilen = mtchain; + if (n < ilen) { + ich = n; + if (ich == 0) + return 0.5*eta_mass[0]*eta_dot[0]*eta_dot[0]; + else + return 0.5*eta_mass[ich]*eta_dot[ich]*eta_dot[ich]; + } + n -= ilen; + } + + if (pstat_flag) { + if (pstyle == ISO) { + ilen = 1; + if (n < ilen) + return p_hydro*(volume-vol0) / nktv2p; + n -= ilen; + } else if (pstyle == ANISO) { + ilen = 3; + if (n < ilen) + if (p_flag[n]) + return p_hydro*(volume-vol0) / (pdim*nktv2p); + else + return 0.0; + n -= ilen; + } else { + ilen = 6; + if (n < ilen) + if (n > 2) return 0.0; + else if (p_flag[n]) + return p_hydro*(volume-vol0) / (pdim*nktv2p); + else + return 
0.0; + n -= ilen; + } + + if (pstyle == ISO) { + ilen = 1; + if (n < ilen) + return pdim*0.5*omega_dot[n]*omega_dot[n]*omega_mass[n]; + n -= ilen; + } else if (pstyle == ANISO) { + ilen = 3; + if (n < ilen) + if (p_flag[n]) + return 0.5*omega_dot[n]*omega_dot[n]*omega_mass[n]; + else return 0.0; + n -= ilen; + } else { + ilen = 6; + if (n < ilen) + if (p_flag[n]) + return 0.5*omega_dot[n]*omega_dot[n]*omega_mass[n]; + else return 0.0; + n -= ilen; + } + + if (mpchain) { + ilen = mpchain; + if (n < ilen) { + ich = n; + if (ich == 0) return lkt_press * etap[0]; + else return kt * etap[ich]; + } + n -= ilen; + ilen = mpchain; + if (n < ilen) { + ich = n; + if (ich == 0) + return 0.5*etap_mass[0]*etap_dot[0]*etap_dot[0]; + else + return 0.5*etap_mass[ich]*etap_dot[ich]*etap_dot[ich]; + } + n -= ilen; + } + + if (deviatoric_flag) { + ilen = 1; + if (n < ilen) + return compute_strain_energy(); + n -= ilen; + } + } + + return 0.0; +} + +/* ---------------------------------------------------------------------- */ + +void FixNHCuda::reset_dt() +{ + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + dthalf = 0.5 * update->dt; + dt4 = 0.25 * update->dt; + dt8 = 0.125 * update->dt; + dto = dthalf; + + // If using respa, then remap is performed in innermost level + + if (strcmp(update->integrate_style,"respa") == 0) + dto = 0.5*step_respa[0]; + + p_freq_max = 0.0; + if (pstat_flag) { + p_freq_max = MAX(p_freq[0],p_freq[1]); + p_freq_max = MAX(p_freq_max,p_freq[2]); + if (pstyle == TRICLINIC) { + p_freq_max = MAX(p_freq_max,p_freq[3]); + p_freq_max = MAX(p_freq_max,p_freq[4]); + p_freq_max = MAX(p_freq_max,p_freq[5]); + } + pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain); + } + + if (tstat_flag) + tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain); +} + +/* ---------------------------------------------------------------------- + perform half-step update of chain thermostat variables 
+------------------------------------------------------------------------- */ + +void FixNHCuda::nhc_temp_integrate() +{ + int ich; + double expfac; + + double lkt = tdof * boltz * t_target; + double kecurrent = tdof * boltz * t_current; + eta_dotdot[0] = (kecurrent - lkt)/eta_mass[0]; + + double ncfac = 1.0/nc_tchain; + for (int iloop = 0; iloop < nc_tchain; iloop++) { + + for (ich = mtchain-1; ich > 0; ich--) { + expfac = exp(-ncfac*dt8*eta_dot[ich+1]); + eta_dot[ich] *= expfac; + eta_dot[ich] += eta_dotdot[ich] * ncfac*dt4; + eta_dot[ich] *= tdrag_factor; + eta_dot[ich] *= expfac; + } + + expfac = exp(-ncfac*dt8*eta_dot[1]); + eta_dot[0] *= expfac; + eta_dot[0] += eta_dotdot[0] * ncfac*dt4; + eta_dot[0] *= tdrag_factor; + eta_dot[0] *= expfac; + + factor_eta = exp(-ncfac*dthalf*eta_dot[0]); + if(which==NOBIAS) + Cuda_FixNHCuda_nh_v_temp(&cuda->shared_data,groupbit,factor_eta,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal); + else if(which==BIAS) + { + if(!temperature->cudable) + { + cuda->downloadAll(); + nh_v_temp(); + cuda->cu_v->upload(); + } + else + { + int groupbit_org=temperature->groupbit; + temperature->groupbit=groupbit; + temperature->remove_bias_all(); + Cuda_FixNHCuda_nh_v_temp(&cuda->shared_data,groupbit,factor_eta,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal); + temperature->restore_bias_all(); + temperature->groupbit=groupbit_org; + } + + } + // rescale temperature due to velocity scaling + // should not be necessary to explicitly recompute the temperature + + t_current *= factor_eta*factor_eta; + kecurrent = tdof * boltz * t_current; + eta_dotdot[0] = (kecurrent - lkt)/eta_mass[0]; + + for (ich = 0; ich < mtchain; ich++) + eta[ich] += ncfac*dthalf*eta_dot[ich]; + + eta_dot[0] *= expfac; + eta_dot[0] += eta_dotdot[0] * ncfac*dt4; + eta_dot[0] *= expfac; + + for (ich = 1; ich < mtchain; ich++) { + expfac = exp(-ncfac*dt8*eta_dot[ich+1]); + eta_dot[ich] *= expfac; + eta_dotdot[ich] = (eta_mass[ich-1]*eta_dot[ich-1]*eta_dot[ich-1] 
+ - boltz * t_target)/eta_mass[ich]; + eta_dot[ich] += eta_dotdot[ich] * ncfac*dt4; + eta_dot[ich] *= expfac; + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step update of chain thermostat variables for barostat + scale barostat velocities +------------------------------------------------------------------------- */ + +void FixNHCuda::nhc_press_integrate() +{ + int ich,i; + double expfac,factor_etap,wmass,kecurrent; + double kt = boltz * t_target; + double lkt_press = kt; + + kecurrent = 0.0; + for (i = 0; i < 3; i++) + if (p_flag[i]) kecurrent += omega_mass[i]*omega_dot[i]*omega_dot[i]; + + if (pstyle == TRICLINIC) { + for (i = 3; i < 6; i++) + if (p_flag[i]) kecurrent += omega_mass[i]*omega_dot[i]*omega_dot[i]; + } + + etap_dotdot[0] = (kecurrent - lkt_press)/etap_mass[0]; + + double ncfac = 1.0/nc_pchain; + for (int iloop = 0; iloop < nc_pchain; iloop++) { + + for (ich = mpchain-1; ich > 0; ich--) { + expfac = exp(-ncfac*dt8*etap_dot[ich+1]); + etap_dot[ich] *= expfac; + etap_dot[ich] += etap_dotdot[ich] * ncfac*dt4; + etap_dot[ich] *= pdrag_factor; + etap_dot[ich] *= expfac; + } + + expfac = exp(-ncfac*dt8*etap_dot[1]); + etap_dot[0] *= expfac; + etap_dot[0] += etap_dotdot[0] * ncfac*dt4; + etap_dot[0] *= pdrag_factor; + etap_dot[0] *= expfac; + + for (ich = 0; ich < mpchain; ich++) + etap[ich] += ncfac*dthalf*etap_dot[ich]; + + factor_etap = exp(-ncfac*dthalf*etap_dot[0]); + for (i = 0; i < 3; i++) + if (p_flag[i]) omega_dot[i] *= factor_etap; + + if (pstyle == TRICLINIC) { + for (i = 3; i < 6; i++) + if (p_flag[i]) omega_dot[i] *= factor_etap; + } + + kecurrent = 0.0; + for (i = 0; i < 3; i++) + if (p_flag[i]) kecurrent += omega_mass[i]*omega_dot[i]*omega_dot[i]; + + if (pstyle == TRICLINIC) { + for (i = 3; i < 6; i++) + if (p_flag[i]) kecurrent += omega_mass[i]*omega_dot[i]*omega_dot[i]; + } + + etap_dotdot[0] = (kecurrent - lkt_press)/etap_mass[0]; + + etap_dot[0] *= expfac; + etap_dot[0] += 
etap_dotdot[0] * ncfac*dt4; + etap_dot[0] *= expfac; + + for (ich = 1; ich < mpchain; ich++) { + expfac = exp(-ncfac*dt8*etap_dot[ich+1]); + etap_dot[ich] *= expfac; + etap_dotdot[ich] = + (etap_mass[ich-1]*etap_dot[ich-1]*etap_dot[ich-1] - boltz*t_target) / + etap_mass[ich]; + etap_dot[ich] += etap_dotdot[ich] * ncfac*dt4; + etap_dot[ich] *= expfac; + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step barostat scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNHCuda::nh_v_press() +{ + double factor[3]; + double **v = atom->v; + int *mask = atom->mask; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2)); + factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2)); + factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2)); + + if (which == NOBIAS) { + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + v[i][0] *= factor[0]; + v[i][1] *= factor[1]; + v[i][2] *= factor[2]; + if (pstyle == TRICLINIC) { + v[i][0] += -dthalf*(v[i][1]*omega_dot[5] + v[i][2]*omega_dot[4]); + v[i][1] += -dthalf*v[i][2]*omega_dot[3]; + } + v[i][0] *= factor[0]; + v[i][1] *= factor[1]; + v[i][2] *= factor[2]; + } + } + } else if (which == BIAS) { + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + temperature->remove_bias(i,v[i]); + v[i][0] *= factor[0]; + v[i][1] *= factor[1]; + v[i][2] *= factor[2]; + if (pstyle == TRICLINIC) { + v[i][0] += -dthalf*(v[i][1]*omega_dot[5] + v[i][2]*omega_dot[4]); + v[i][1] += -dthalf*v[i][2]*omega_dot[3]; + } + v[i][0] *= factor[0]; + v[i][1] *= factor[1]; + v[i][2] *= factor[2]; + temperature->restore_bias(i,v[i]); + } + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step update of velocities +-----------------------------------------------------------------------*/ + +void FixNHCuda::nve_v() +{ + 
double dtfm; + double **v = atom->v; + double **f = atom->f; + double *rmass = atom->rmass; + double *mass = atom->mass; + int *type = atom->type; + int *mask = atom->mask; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + if (rmass) { + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + dtfm = dtf / rmass[i]; + v[i][0] += dtfm*f[i][0]; + v[i][1] += dtfm*f[i][1]; + v[i][2] += dtfm*f[i][2]; + } + } + } else { + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + dtfm = dtf / mass[type[i]]; + v[i][0] += dtfm*f[i][0]; + v[i][1] += dtfm*f[i][1]; + v[i][2] += dtfm*f[i][2]; + } + } + } +} + +/* ---------------------------------------------------------------------- + perform full-step update of positions +-----------------------------------------------------------------------*/ + +void FixNHCuda::nve_x() +{ + double **x = atom->x; + double **v = atom->v; + int *mask = atom->mask; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + // x update by full step only for atoms in group + + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + x[i][0] += dtv * v[i][0]; + x[i][1] += dtv * v[i][1]; + x[i][2] += dtv * v[i][2]; + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step thermostat scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNHCuda::nh_v_temp() +{ + double **v = atom->v; + int *mask = atom->mask; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + if (which == NOBIAS) { + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + v[i][0] *= factor_eta; + v[i][1] *= factor_eta; + v[i][2] *= factor_eta; + } + } + } else if (which == BIAS) { + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + temperature->remove_bias(i,v[i]); + v[i][0] *= factor_eta; + v[i][1] *= factor_eta; + v[i][2] *= 
factor_eta; + temperature->restore_bias(i,v[i]); + } + } + } +} + +/* ---------------------------------------------------------------------- + compute sigma tensor + needed whenever p_target or h0_inv changes +-----------------------------------------------------------------------*/ + +void FixNHCuda::compute_sigma() +{ + // if nreset_h0 > 0, reset vol0 and h0_inv + // every nreset_h0 timesteps + + if (nreset_h0 > 0) { + int delta = update->ntimestep - update->beginstep; + if (delta % nreset_h0 == 0) { + if (dimension == 3) vol0 = domain->xprd * domain->yprd * domain->zprd; + else vol0 = domain->xprd * domain->yprd; + h0_inv[0] = domain->h_inv[0]; + h0_inv[1] = domain->h_inv[1]; + h0_inv[2] = domain->h_inv[2]; + h0_inv[3] = domain->h_inv[3]; + h0_inv[4] = domain->h_inv[4]; + h0_inv[5] = domain->h_inv[5]; + } + } + + // generate upper-triangular half of + // sigma = vol0*h0inv*(p_target-p_hydro)*h0inv^t + // units of sigma are are PV/L^2 e.g. atm.A + // + // [ 0 5 4 ] [ 0 5 4 ] [ 0 5 4 ] [ 0 - - ] + // [ 5 1 3 ] = [ - 1 3 ] [ 5 1 3 ] [ 5 1 - ] + // [ 4 3 2 ] [ - - 2 ] [ 4 3 2 ] [ 4 3 2 ] + + sigma[0] = + vol0*(h0_inv[0]*((p_target[0]-p_hydro)*h0_inv[0] + + p_target[5]*h0_inv[5]+p_target[4]*h0_inv[4]) + + h0_inv[5]*(p_target[5]*h0_inv[0] + + (p_target[1]-p_hydro)*h0_inv[5]+p_target[3]*h0_inv[4]) + + h0_inv[4]*(p_target[4]*h0_inv[0]+p_target[3]*h0_inv[5] + + (p_target[2]-p_hydro)*h0_inv[4])); + sigma[1] = + vol0*(h0_inv[1]*((p_target[1]-p_hydro)*h0_inv[1] + + p_target[3]*h0_inv[3]) + + h0_inv[3]*(p_target[3]*h0_inv[1] + + (p_target[2]-p_hydro)*h0_inv[3])); + sigma[2] = + vol0*(h0_inv[2]*((p_target[2]-p_hydro)*h0_inv[2])); + sigma[3] = + vol0*(h0_inv[1]*(p_target[3]*h0_inv[2]) + + h0_inv[3]*((p_target[2]-p_hydro)*h0_inv[2])); + sigma[4] = + vol0*(h0_inv[0]*(p_target[4]*h0_inv[2]) + + h0_inv[5]*(p_target[3]*h0_inv[2]) + + h0_inv[4]*((p_target[2]-p_hydro)*h0_inv[2])); + sigma[5] = + vol0*(h0_inv[0]*(p_target[5]*h0_inv[1]+p_target[4]*h0_inv[3]) + + 
h0_inv[5]*((p_target[1]-p_hydro)*h0_inv[1]+p_target[3]*h0_inv[3]) + + h0_inv[4]*(p_target[3]*h0_inv[1]+(p_target[2]-p_hydro)*h0_inv[3])); +} + +/* ---------------------------------------------------------------------- + compute strain energy +-----------------------------------------------------------------------*/ + +double FixNHCuda::compute_strain_energy() +{ + // compute strain energy = 0.5*Tr(sigma*h*h^t) in energy units + + double* h = domain->h; + double d0,d1,d2; + + d0 = + sigma[0]*(h[0]*h[0]+h[5]*h[5]+h[4]*h[4]) + + sigma[5]*( h[1]*h[5]+h[3]*h[4]) + + sigma[4]*( h[2]*h[4]); + d1 = + sigma[5]*( h[5]*h[1]+h[4]*h[3]) + + sigma[1]*( h[1]*h[1]+h[3]*h[3]) + + sigma[3]*( h[2]*h[3]); + d2 = + sigma[4]*( h[4]*h[2]) + + sigma[3]*( h[3]*h[2]) + + sigma[2]*( h[2]*h[2]); + + double energy = 0.5*(d0+d1+d2)/nktv2p; + return energy; +} + +/* ---------------------------------------------------------------------- + compute deviatoric barostat force = h*sigma*h^t +-----------------------------------------------------------------------*/ + +void FixNHCuda::compute_deviatoric() +{ + // generate upper-triangular part of h*sigma*h^t + // units of fdev are are PV, e.g. 
atm*A^3 + // [ 0 5 4 ] [ 0 5 4 ] [ 0 5 4 ] [ 0 - - ] + // [ 5 1 3 ] = [ - 1 3 ] [ 5 1 3 ] [ 5 1 - ] + // [ 4 3 2 ] [ - - 2 ] [ 4 3 2 ] [ 4 3 2 ] + + double* h = domain->h; + + fdev[0] = + h[0]*(sigma[0]*h[0]+sigma[5]*h[5]+sigma[4]*h[4]) + + h[5]*(sigma[5]*h[0]+sigma[1]*h[5]+sigma[3]*h[4]) + + h[4]*(sigma[4]*h[0]+sigma[3]*h[5]+sigma[2]*h[4]); + fdev[1] = + h[1]*( sigma[1]*h[1]+sigma[3]*h[3]) + + h[3]*( sigma[3]*h[1]+sigma[2]*h[3]); + fdev[2] = + h[2]*( sigma[2]*h[2]); + fdev[3] = + h[1]*( sigma[3]*h[2]) + + h[3]*( sigma[2]*h[2]); + fdev[4] = + h[0]*( sigma[4]*h[2]) + + h[5]*( sigma[3]*h[2]) + + h[4]*( sigma[2]*h[2]); + fdev[5] = + h[0]*( sigma[5]*h[1]+sigma[4]*h[3]) + + h[5]*( sigma[1]*h[1]+sigma[3]*h[3]) + + h[4]*( sigma[3]*h[1]+sigma[2]*h[3]); +} + +/* ---------------------------------------------------------------------- + compute hydrostatic target pressure +-----------------------------------------------------------------------*/ + +void FixNHCuda::compute_press_target() +{ + double delta = update->ntimestep - update->beginstep; + if (update->endstep > update->beginstep) + delta /= update->endstep - update->beginstep; + else delta = 0.0; + + p_hydro = 0.0; + for (int i = 0; i < 3; i++) + if (p_flag[i]) { + p_target[i] = p_start[i] + delta * (p_stop[i]-p_start[i]); + p_hydro += p_target[i]; + } + p_hydro /= pdim; + + if (pstyle == TRICLINIC) + for (int i = 3; i < 6; i++) + p_target[i] = p_start[i] + delta * (p_stop[i]-p_start[i]); + + // if deviatoric, recompute sigma each time p_target changes + + if (deviatoric_flag) compute_sigma(); +} + +/* ---------------------------------------------------------------------- + update omega_dot, omega, dilation +-----------------------------------------------------------------------*/ + +void FixNHCuda::nh_omega_dot() +{ + double f_omega,volume; + + if (dimension == 3) volume = domain->xprd*domain->yprd*domain->zprd; + else volume = domain->xprd*domain->yprd; + + if (deviatoric_flag) compute_deviatoric(); + + mtk_term1 = 
0.0; + if (mtk_flag) + if (pstyle == ISO) { + mtk_term1 = tdof * boltz * t_current; + mtk_term1 /= pdim * atom->natoms; + } else { + double *mvv_current = temperature->vector; + for (int i = 0; i < 3; i++) + if (p_flag[i]) + mtk_term1 += mvv_current[i]; + mtk_term1 /= pdim * atom->natoms; + } + + for (int i = 0; i < 3; i++) + if (p_flag[i]) { + f_omega = (p_current[i]-p_hydro)*volume / + (omega_mass[i] * nktv2p) + mtk_term1 / omega_mass[i]; + if (deviatoric_flag) f_omega -= fdev[i]/(omega_mass[i] * nktv2p); + omega_dot[i] += f_omega*dthalf; + omega_dot[i] *= pdrag_factor; + } + + mtk_term2 = 0.0; + if (mtk_flag) { + for (int i = 0; i < 3; i++) + if (p_flag[i]) + mtk_term2 += omega_dot[i]; + mtk_term2 /= pdim * atom->natoms; + } + + if (pstyle == TRICLINIC) { + for (int i = 3; i < 6; i++) { + if (p_flag[i]) { + f_omega = p_current[i]*volume/(omega_mass[i] * nktv2p); + if (deviatoric_flag) + f_omega -= fdev[i]/(omega_mass[i] * nktv2p); + omega_dot[i] += f_omega*dthalf; + omega_dot[i] *= pdrag_factor; + } + } + } +} + diff --git a/src/USER-CUDA/fix_nh_cuda.h b/src/USER-CUDA/fix_nh_cuda.h new file mode 100644 index 0000000000..8c192b56dd --- /dev/null +++ b/src/USER-CUDA/fix_nh_cuda.h @@ -0,0 +1,126 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_FIX_NH_CUDA_H +#define LMP_FIX_NH_CUDA_H + +#include "fix.h" +#include "cuda_precision.h" + +namespace LAMMPS_NS { + +class FixNHCuda : public Fix { + public: + FixNHCuda(class LAMMPS *, int, char **); + virtual ~FixNHCuda(); + int setmask(); + virtual void init(); + void setup(int); + virtual void initial_integrate(int); + virtual void final_integrate(); + void initial_integrate_respa(int, int, int); + void final_integrate_respa(int, int); + double compute_scalar(); + double compute_vector(int); + void write_restart(FILE *); + void restart(char *); + int modify_param(int, char **); + void reset_dt(); + + protected: + class Cuda *cuda; + int dimension,which; + double dtv,dtf,dthalf,dt4,dt8,dto; + double boltz,nktv2p,tdof; + double vol0,t0; + + double t_start,t_stop; + double t_current,t_target; + double t_freq; + + int tstat_flag; // 1 if control T + int pstat_flag; // 1 if control P + + int pstyle,pcouple,allremap; + int p_flag[6]; // 1 if control P on this dim, 0 if not + double p_start[6],p_stop[6]; + double p_freq[6],p_target[6]; + double omega[6],omega_dot[6]; + double omega_mass[6]; + double p_current[6],dilation[6]; + double drag,tdrag_factor; // drag factor on particle thermostat + double pdrag_factor; // drag factor on barostat + double factor[6]; // velocity scaling due to barostat + int kspace_flag; // 1 if KSpace invoked, 0 if not + int nrigid; // number of rigid fixes + int *rfix; // indices of rigid fixes + + int nlevels_respa; + double *step_respa; + + char *id_temp,*id_press; + class Compute *temperature,*pressure; + int tflag,pflag; + + double *eta,*eta_dot; // chain thermostat for particles + double *eta_dotdot; + double *eta_mass; + int mtchain; // length of chain + + double *etap; // chain thermostat for barostat + double *etap_dot; + double *etap_dotdot; + double *etap_mass; + int mpchain; // length of chain + + int mtk_flag; // 0 if using Hoover 
barostat + double mtk_term1,mtk_term2; + int mtchain_default_flag; + int pdim; // number of barostatted dims + double mvv_current[3]; // diagonal of KE tensor + double mtk_factor; // MTK factor + double p_freq_max; // maximum barostat frequency + + double p_hydro; // hydrostatic target pressure + + int nc_tchain,nc_pchain; + double factor_eta; + double sigma[6]; // scaled target stress + double fdev[6]; // deviatoric force on barostat + int deviatoric_flag; // 0 if target stress tensor is hydrostatic + double h0_inv[6]; // h_inv of reference (zero strain) box + int nreset_h0; // interval for resetting h0 + + void couple(); + void couple_ke(); + void remap(); + void nhc_temp_integrate(); + void nhc_press_integrate(); + + virtual void nve_x(); // may be overwritten by child classes + virtual void nve_v(); + virtual void nh_v_press(); + virtual void nh_v_temp(); + + void compute_sigma(); + void compute_deviatoric(); + double compute_strain_energy(); + void compute_press_target(); + void nh_omega_dot(); + + X_FLOAT triggerneighsq; +}; + +} + +#endif diff --git a/src/USER-CUDA/fix_npt_cuda.cpp b/src/USER-CUDA/fix_npt_cuda.cpp new file mode 100644 index 0000000000..f254b4b7d1 --- /dev/null +++ b/src/USER-CUDA/fix_npt_cuda.cpp @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include +#include "fix_npt_cuda.h" +#include "modify.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +FixNPTCuda::FixNPTCuda(LAMMPS *lmp, int narg, char **arg) : + FixNHCuda(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + if (!tstat_flag) + error->all("Temperature control must be used with fix npt"); + if (!pstat_flag) + error->all("Pressure control must be used with fix npt"); + + // create a new compute temp style + // id = fix-ID + temp + // compute group = all since pressure is always global (group all) + // and thus its KE/temperature contribution should use group all + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = (char *) "all"; + newarg[2] = (char *) "temp/cuda"; + + modify->add_compute(3,newarg); + delete [] newarg; + tflag = 1; + + // create a new compute pressure style + // id = fix-ID + press, compute group = all + // pass id_temp as 4th arg to pressure constructor + + n = strlen(id) + 7; + id_press = new char[n]; + strcpy(id_press,id); + strcat(id_press,"_press"); + + newarg = new char*[4]; + newarg[0] = id_press; + newarg[1] = (char *) "all"; + newarg[2] = (char *) "pressure/cuda"; + newarg[3] = id_temp; + modify->add_compute(4,newarg); + delete [] newarg; + pflag = 1; +} diff --git a/src/USER-CUDA/fix_npt_cuda.h b/src/USER-CUDA/fix_npt_cuda.h new file mode 100644 index 0000000000..1dc5f5af35 --- /dev/null +++ b/src/USER-CUDA/fix_npt_cuda.h @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + 
http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(npt/cuda,FixNPTCuda) + +#else + +#ifndef LMP_FIX_NPTCuda_H +#define LMP_FIX_NPTCuda_H + +#include "fix_nh_cuda.h" + +namespace LAMMPS_NS { + +class FixNPTCuda : public FixNHCuda { + public: + FixNPTCuda(class LAMMPS *, int, char **); + ~FixNPTCuda() {} +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/fix_nve_cuda.cpp b/src/USER-CUDA/fix_nve_cuda.cpp new file mode 100644 index 0000000000..367dd7c24e --- /dev/null +++ b/src/USER-CUDA/fix_nve_cuda.cpp @@ -0,0 +1,155 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include "fix_nve_cuda.h" +#include "fix_nve_cuda_cu.h" +#include "atom.h" +#include "force.h" +#include "update.h" +#include "respa.h" +#include "error.h" +#include "cuda.h" +#include "cuda_modify_flags.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +FixNVECuda::FixNVECuda(LAMMPS *lmp, int narg, char **arg) : + Fix(lmp, narg, arg) +{ + cuda = lmp->cuda; + + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + if (strcmp(style,"nve/sphere") != 0 && narg < 3) + error->all("Illegal fix nve command"); + + time_integrate = 1; +} + +/* ---------------------------------------------------------------------- */ + +int FixNVECuda::setmask() +{ + int mask = 0; + mask |= INITIAL_INTEGRATE_CUDA; + mask |= FINAL_INTEGRATE_CUDA; + // mask |= INITIAL_INTEGRATE_RESPA_CUDA; + // mask |= FINAL_INTEGRATE_RESPA_CUDA; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixNVECuda::init() +{ + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + + if (strcmp(update->integrate_style,"respa") == 0) + step_respa = ((Respa *) update->integrate)->step; + + triggerneighsq= cuda->shared_data.atom.triggerneighsq; + cuda->neighbor_decide_by_integrator=1; + Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf); + +} + +/* ---------------------------------------------------------------------- + allow for both per-type and per-atom mass +------------------------------------------------------------------------- */ + +void FixNVECuda::initial_integrate(int vflag) +{ + if(triggerneighsq!=cuda->shared_data.atom.triggerneighsq) + { + triggerneighsq= cuda->shared_data.atom.triggerneighsq; + Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf); + } + int nlocal = atom->nlocal; + if(igroup == atom->firstgroup) nlocal = atom->nfirst; + + Cuda_FixNVECuda_InitialIntegrate(& cuda->shared_data, groupbit,nlocal); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVECuda::final_integrate() +{ + int nlocal = atom->nlocal; + if(igroup == atom->firstgroup) nlocal = atom->nfirst; + + Cuda_FixNVECuda_FinalIntegrate(& cuda->shared_data, groupbit,nlocal); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVECuda::initial_integrate_respa(int vflag, int ilevel, int flag) +{ + //this point should not be reached yet since RESPA is not 
supported + if (flag) return; // only used by NPT,NPH + + dtv = step_respa[ilevel]; + dtf = 0.5 * step_respa[ilevel] * force->ftm2v; + + // innermost level - NVE update of v and x + // all other levels - NVE update of v + + if(ilevel == 0) initial_integrate(vflag); + else final_integrate(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVECuda::final_integrate_respa(int ilevel, int iloop) +{ + //this point should not be reached yet since RESPA is not supported + dtf = 0.5 * step_respa[ilevel] * force->ftm2v; + final_integrate(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVECuda::reset_dt() +{ + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf); +} diff --git a/src/USER-CUDA/fix_nve_cuda.h b/src/USER-CUDA/fix_nve_cuda.h new file mode 100644 index 0000000000..6968297610 --- /dev/null +++ b/src/USER-CUDA/fix_nve_cuda.h @@ -0,0 +1,63 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/cuda,FixNVECuda) + +#else + +#ifndef LMP_FIX_NVE_CUDA_H +#define LMP_FIX_NVE_CUDA_H + +#include "fix.h" +#include "cuda_precision.h" + +namespace LAMMPS_NS { + +class FixNVECuda : public Fix +{ + public: + FixNVECuda(class LAMMPS *, int, char **); + int setmask(); + virtual void init(); + virtual void initial_integrate(int); + virtual void final_integrate(); + void initial_integrate_respa(int, int, int); + void final_integrate_respa(int, int); + void reset_dt(); + + X_FLOAT triggerneighsq; + + protected: + class Cuda *cuda; + double dtv, dtf; + double *step_respa; + int mass_require; + +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/fix_nvt_cuda.cpp b/src/USER-CUDA/fix_nvt_cuda.cpp new file mode 100644 index 0000000000..49a3c63013 --- /dev/null +++ b/src/USER-CUDA/fix_nvt_cuda.cpp @@ -0,0 +1,48 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include +#include "fix_nvt_cuda.h" +#include "group.h" +#include "modify.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +FixNVTCuda::FixNVTCuda(LAMMPS *lmp, int narg, char **arg) : + FixNHCuda(lmp, narg, arg) +{ + if (!tstat_flag) + error->all("Temperature control must be used with fix nvt"); + if (pstat_flag) + error->all("Pressure control can not be used with fix nvt"); + + // create a new compute temp style + // id = fix-ID + temp + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp/cuda"; + + modify->add_compute(3,newarg); + delete [] newarg; + tflag = 1; +} diff --git a/src/USER-CUDA/fix_nvt_cuda.h b/src/USER-CUDA/fix_nvt_cuda.h new file mode 100644 index 0000000000..02e5ca3d58 --- /dev/null +++ b/src/USER-CUDA/fix_nvt_cuda.h @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nvt/cuda,FixNVTCuda) + +#else + +#ifndef LMP_FIX_NVTCuda_H +#define LMP_FIX_NVTCuda_H + +#include "fix_nh_cuda.h" + +namespace LAMMPS_NS { + +class FixNVTCuda : public FixNHCuda { + public: + FixNVTCuda(class LAMMPS *, int, char **); + ~FixNVTCuda() {} +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/fix_set_force_cuda.cpp b/src/USER-CUDA/fix_set_force_cuda.cpp new file mode 100644 index 0000000000..8f8c87c82f --- /dev/null +++ b/src/USER-CUDA/fix_set_force_cuda.cpp @@ -0,0 +1,181 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +#include +#include +#include "fix_set_force_cuda.h" +#include "fix_set_force_cuda_cu.h" +#include "atom.h" +#include "update.h" +#include "respa.h" +#include "error.h" +#include "cuda.h" +#include "memory.h" +#include "cuda_modify_flags.h" + + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +FixSetForceCuda::FixSetForceCuda(LAMMPS *lmp, int narg, char **arg) : + Fix(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + if (narg != 6) error->all("Illegal fix setforce/cuda command"); + + vector_flag = 1; + size_vector = 3; + global_freq = 1; + extvector = 1; + + flagx = flagy = flagz = 1; + if (strcmp(arg[3],"NULL") == 0) flagx = 0; + else xvalue = atof(arg[3]); + if (strcmp(arg[4],"NULL") == 0) flagy = 0; + else yvalue = atof(arg[4]); + if (strcmp(arg[5],"NULL") == 0) flagz = 0; + else zvalue = atof(arg[5]); + + force_flag = 0; + foriginal[0] = foriginal[1] = foriginal[2] = 0.0; + cu_foriginal=NULL; +} + +/* ---------------------------------------------------------------------- */ + +int FixSetForceCuda::setmask() +{ + int mask = 0; + mask |= POST_FORCE_CUDA; + mask |= THERMO_ENERGY_CUDA; + mask |= POST_FORCE_RESPA; + mask |= MIN_POST_FORCE_CUDA; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixSetForceCuda::init() +{ + if(not cu_foriginal) + cu_foriginal = new cCudaData (foriginal,3); + if (strcmp(update->integrate_style,"respa") == 0) + nlevels_respa = ((Respa *) update->integrate)->nlevels; +} + +/* ---------------------------------------------------------------------- */ + +void FixSetForceCuda::setup(int vflag) +{ + MYDBG( printf("# CUDA: FixSetForceCuda::setup\n"); ) + + if (strcmp(update->integrate_style,"verlet") == 0) 
+ { + Cuda_FixSetForceCuda_Init(&cuda->shared_data); + cuda->cu_f->upload(); + post_force(vflag); + cuda->cu_f->download(); + + } + else { + ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1); + cuda->cu_f->download(); + post_force_respa(vflag,nlevels_respa-1,0); + cuda->cu_f->upload(); + ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1); + } + MYDBG( printf("# CUDA: FixSetForceCuda::setup done\n"); ) +} + +/* ---------------------------------------------------------------------- */ + +void FixSetForceCuda::min_setup(int vflag) +{ + post_force(vflag); +} + +/* ---------------------------------------------------------------------- */ + +void FixSetForceCuda::post_force(int vflag) +{ + MYDBG( printf("# CUDA: FixSetForceCuda::postforce start\n"); ) + force_flag = 0; + cu_foriginal->memset_device(0); + Cuda_FixSetForceCuda_PostForce(&cuda->shared_data, groupbit, xvalue, yvalue,zvalue,(F_FLOAT*) cu_foriginal->dev_data(),flagx,flagy,flagz); + cu_foriginal->download(); +} + +/* ---------------------------------------------------------------------- */ + +void FixSetForceCuda::post_force_respa(int vflag, int ilevel, int iloop) +{ + if (ilevel == nlevels_respa-1) post_force(vflag); + else { + cuda->cu_f->download(); + cuda->cu_mask->download(); + + double **f = atom->f; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + foriginal[0] = foriginal[1] = foriginal[2] = 0.0; + force_flag = 0; + + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + foriginal[0] += f[i][0]; + foriginal[1] += f[i][1]; + foriginal[2] += f[i][2]; + if (flagx) f[i][0] = 0.0; + if (flagy) f[i][1] = 0.0; + if (flagz) f[i][2] = 0.0; + } + cuda->cu_f->upload(); + } +} + +/* ---------------------------------------------------------------------- */ + +void FixSetForceCuda::min_post_force(int vflag) +{ + post_force(vflag); +} + + +/* ---------------------------------------------------------------------- + return components of total force on fix group before force was 
changed +------------------------------------------------------------------------- */ + +double FixSetForceCuda::compute_vector(int n) +{ + // only sum across procs one time + + if (force_flag == 0) { + MPI_Allreduce(foriginal,foriginal_all,3,MPI_DOUBLE,MPI_SUM,world); + force_flag = 1; + } + return foriginal_all[n+1]; +} diff --git a/src/USER-CUDA/fix_set_force_cuda.h b/src/USER-CUDA/fix_set_force_cuda.h new file mode 100644 index 0000000000..c233294a5b --- /dev/null +++ b/src/USER-CUDA/fix_set_force_cuda.h @@ -0,0 +1,63 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(setforce/cuda,FixSetForceCuda) + +#else + +#ifndef LMP_FIX_SET_FORCE_CUDA_H +#define LMP_FIX_SET_FORCE_CUDA_H + +#include "fix.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class FixSetForceCuda : public Fix { + public: + FixSetForceCuda(class LAMMPS *, int, char **); + int setmask(); + void init(); + void setup(int); + void min_setup(int); + void post_force(int); + void post_force_respa(int, int, int); + void min_post_force(int); + double compute_vector(int); + + private: + class Cuda *cuda; + int flagx,flagy,flagz; + double xvalue,yvalue,zvalue; + double foriginal[3],foriginal_all[3]; + cCudaData* cu_foriginal; + int force_flag; + int nlevels_respa; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/fix_shake_cuda.cpp b/src/USER-CUDA/fix_shake_cuda.cpp new file mode 100644 index 0000000000..dd221e8536 --- /dev/null +++ b/src/USER-CUDA/fix_shake_cuda.cpp @@ -0,0 +1,2619 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "mpi.h" +#include +#include +#include +#include +#include +#include "fix_shake_cuda.h" +#include "fix_shake_cuda_cu.h" +#include "atom.h" +#include "update.h" +#include "respa.h" +#include "modify.h" +#include "domain.h" +#include "force.h" +#include "bond.h" +#include "angle.h" +#include "comm.h" +#include "group.h" +#include "fix_respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" +#include "cuda_modify_flags.h" + +using namespace LAMMPS_NS; + +#define BIG 1.0e20 +#define MASSDELTA 0.1 +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +FixShakeCuda::FixShakeCuda(LAMMPS *lmp, int narg, char **arg) : + Fix(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + cuda->accelerator(0,NULL); + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + neighbor_step=true; + PI = 4.0*atan(1.0); + + virial_flag = 1; + create_attribute = 1; + + // error check + + if (atom->molecular == 0) + error->all("Cannot use fix shake with non-molecular system"); + + // perform initial allocation of atom-based arrays + // register with Atom class + + shake_flag = NULL; + shake_atom = shake_type = NULL; + xshake = NULL; + cu_shake_flag = NULL; + cu_shake_atom = NULL; + cu_shake_type = NULL; + cu_xshake = NULL; + cu_list = NULL; + cu_bond_distance = NULL; + cu_angle_distance = NULL; + cu_virial = new cCudaData(virial,6); + grow_arrays(atom->nmax); + atom->add_callback(0); + + // set comm size needed by this fix + + comm_forward = 3; + + // parse SHAKE args + + if (narg < 8) error->all("Illegal fix shake command"); + + tolerance = atof(arg[3]); + max_iter = atoi(arg[4]); + output_every = atoi(arg[5]); + + // parse SHAKE args for 
bond and angle types + // will be used by find_clusters + // store args for "b" "a" "t" as flags in (1:n) list for fast access + // store args for "m" in list of length nmass for looping over + // for "m" verify that atom masses have been set + + bond_flag = new int[atom->nbondtypes+1]; + for (int i = 1; i <= atom->nbondtypes; i++) bond_flag[i] = 0; + angle_flag = new int[atom->nangletypes+1]; + for (int i = 1; i <= atom->nangletypes; i++) angle_flag[i] = 0; + type_flag = new int[atom->ntypes+1]; + for (int i = 1; i <= atom->ntypes; i++) type_flag[i] = 0; + mass_list = new double[atom->ntypes]; + nmass = 0; + + char mode = '\0'; + int next = 6; + while (next < narg) { + + if (strcmp(arg[next],"b") == 0) mode = 'b'; + else if (strcmp(arg[next],"a") == 0) mode = 'a'; + else if (strcmp(arg[next],"t") == 0) mode = 't'; + else if (strcmp(arg[next],"m") == 0) { + mode = 'm'; + atom->check_mass(); + + } else if (mode == 'b') { + int i = atoi(arg[next]); + if (i < 1 || i > atom->nbondtypes) + error->all("Invalid bond type index for fix shake"); + bond_flag[i] = 1; + + } else if (mode == 'a') { + int i = atoi(arg[next]); + if (i < 1 || i > atom->nangletypes) + error->all("Invalid angle type index for fix shake"); + angle_flag[i] = 1; + + } else if (mode == 't') { + int i = atoi(arg[next]); + if (i < 1 || i > atom->ntypes) + error->all("Invalid atom type index for fix shake"); + type_flag[i] = 1; + + } else if (mode == 'm') { + double massone = atof(arg[next]); + if (massone == 0.0) error->all("Invalid atom mass for fix shake"); + if (nmass == atom->ntypes) error->all("Too many masses for fix shake"); + mass_list[nmass++] = massone; + + } else error->all("Illegal fix shake command"); + next++; + } + + // allocate bond and angle distance arrays, indexed from 1 to n + + bond_distance = new double[atom->nbondtypes+1]; + angle_distance = new double[atom->nangletypes+1]; + + cu_bond_distance = new cCudaData (bond_distance, atom->nbondtypes+1); + cu_angle_distance = new cCudaData 
(angle_distance, atom->nangletypes+1); + + // allocate statistics arrays + + if (output_every) { + int nb = atom->nbondtypes + 1; + b_count = new int[nb]; + b_count_all = new int[nb]; + b_ave = new double[nb]; + b_ave_all = new double[nb]; + b_max = new double[nb]; + b_max_all = new double[nb]; + b_min = new double[nb]; + b_min_all = new double[nb]; + + int na = atom->nangletypes + 1; + a_count = new int[na]; + a_count_all = new int[na]; + a_ave = new double[na]; + a_ave_all = new double[na]; + a_max = new double[na]; + a_max_all = new double[na]; + a_min = new double[na]; + a_min_all = new double[na]; + } + + cudable_comm=true; + // identify all SHAKE clusters + + find_clusters(); + + // initialize list of SHAKE clusters to constrain + + maxlist = 0; + list = NULL; + Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq, + cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(), + cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(), + max_iter,tolerance); + + +} + +/* ---------------------------------------------------------------------- */ + +FixShakeCuda::~FixShakeCuda() +{ + // unregister callbacks to this fix from Atom class + + atom->delete_callback(id,0); + + // set bond_type and angle_type back to positive for SHAKE clusters + // must set for all SHAKE bonds and angles stored by each atom + + int **bond_type = atom->bond_type; + int **angle_type = atom->angle_type; + int nlocal = atom->nlocal; + + int n; + for (int i = 0; i < nlocal; i++) { + if (shake_flag[i] == 0) continue; + else if (shake_flag[i] == 1) { + n = bondfind(i,shake_atom[i][0],shake_atom[i][1]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = bondfind(i,shake_atom[i][0],shake_atom[i][2]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = anglefind(i,shake_atom[i][1],shake_atom[i][2]); + if (n >= 0) angle_type[i][n] = -angle_type[i][n]; + } else if (shake_flag[i] == 2) { + n = 
bondfind(i,shake_atom[i][0],shake_atom[i][1]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + } else if (shake_flag[i] == 3) { + n = bondfind(i,shake_atom[i][0],shake_atom[i][1]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = bondfind(i,shake_atom[i][0],shake_atom[i][2]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + } else if (shake_flag[i] == 4) { + n = bondfind(i,shake_atom[i][0],shake_atom[i][1]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = bondfind(i,shake_atom[i][0],shake_atom[i][2]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = bondfind(i,shake_atom[i][0],shake_atom[i][3]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + } + } + + // delete locally stored arrays + + memory->destroy(shake_flag); + memory->destroy(shake_atom); + memory->destroy(shake_type); + memory->destroy(xshake); + + delete [] bond_flag; + delete [] angle_flag; + delete [] type_flag; + delete [] mass_list; + + delete [] bond_distance; + delete [] angle_distance; + + if (output_every) { + delete [] b_count; + delete [] b_count_all; + delete [] b_ave; + delete [] b_ave_all; + delete [] b_max; + delete [] b_max_all; + delete [] b_min; + delete [] b_min_all; + + delete [] a_count; + delete [] a_count_all; + delete [] a_ave; + delete [] a_ave_all; + delete [] a_max; + delete [] a_max_all; + delete [] a_min; + delete [] a_min_all; + } + + memory->destroy(list); + + delete cu_shake_flag; + delete cu_shake_atom; + delete cu_shake_type; + delete cu_xshake; + delete cu_list; + delete cu_bond_distance; + delete cu_angle_distance; +} + +/* ---------------------------------------------------------------------- */ + +int FixShakeCuda::setmask() +{ + int mask = 0; + mask |= PRE_NEIGHBOR_CUDA; + mask |= POST_FORCE_CUDA; + mask |= POST_FORCE_RESPA; + return mask; +} + +/* ---------------------------------------------------------------------- + set bond and angle distances + this init must happen after force->bond and force->angle inits 
+------------------------------------------------------------------------- */ + +void FixShakeCuda::init() +{ + int i,m,flag,flag_all,type1,type2,bond1_type,bond2_type; + double rsq,angle; + + // error if more than one shake fix + + int count = 0; + for (i = 0; i < modify->nfix; i++) + if (strcmp(modify->fix[i]->style,"shake") == 0) count++; + if (count > 1) error->all("More than one fix shake"); + + // cannot use with minimization since SHAKE turns off bonds + // that should contribute to potential energy + + if (update->whichflag == 2) + error->all("Fix shake cannot be used with minimization"); + + // error if npt,nph fix comes before shake fix + + for (i = 0; i < modify->nfix; i++) { + if (strcmp(modify->fix[i]->style,"npt") == 0) break; + if (strcmp(modify->fix[i]->style,"nph") == 0) break; + } + if (i < modify->nfix) { + for (int j = i; j < modify->nfix; j++) + if (strcmp(modify->fix[j]->style,"shake") == 0) + error->all("Shake fix must come before NPT/NPH fix"); + } + + // if rRESPA, find associated fix that must exist + // could have changed locations in fix list since created + // set ptrs to rRESPA variables + + if (strcmp(update->integrate_style,"respa") == 0) { + for (i = 0; i < modify->nfix; i++) + if (strcmp(modify->fix[i]->style,"RESPA") == 0) ifix_respa = i; + nlevels_respa = ((Respa *) update->integrate)->nlevels; + loop_respa = ((Respa *) update->integrate)->loop; + step_respa = ((Respa *) update->integrate)->step; + } + + // set equilibrium bond distances + + if (force->bond == NULL) + error->all("Bond potential must be defined for SHAKE"); + for (i = 1; i <= atom->nbondtypes; i++) + bond_distance[i] = force->bond->equilibrium_distance(i); + + // set equilibrium angle distances + + int nlocal = atom->nlocal; + + for (i = 1; i <= atom->nangletypes; i++) { + if (angle_flag[i] == 0) continue; + if (force->angle == NULL) + error->all("Angle potential must be defined for SHAKE"); + + // scan all atoms for a SHAKE angle cluster + // extract bond types 
for the 2 bonds in the cluster + // bond types must be same in all clusters of this angle type, + // else set error flag + + flag = 0; + bond1_type = bond2_type = 0; + for (m = 0; m < nlocal; m++) { + if (shake_flag[m] != 1) continue; + if (shake_type[m][2] != i) continue; + type1 = MIN(shake_type[m][0],shake_type[m][1]); + type2 = MAX(shake_type[m][0],shake_type[m][1]); + if (bond1_type > 0) { + if (type1 != bond1_type || type2 != bond2_type) { + flag = 1; + break; + } + } + bond1_type = type1; + bond2_type = type2; + } + + // error check for any bond types that are not the same + + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_MAX,world); + if (flag_all) error->all("Shake angles have different bond types"); + + // insure all procs have bond types + + MPI_Allreduce(&bond1_type,&flag_all,1,MPI_INT,MPI_MAX,world); + bond1_type = flag_all; + MPI_Allreduce(&bond2_type,&flag_all,1,MPI_INT,MPI_MAX,world); + bond2_type = flag_all; + + // if bond types are 0, no SHAKE angles of this type exist + // just skip this angle + + if (bond1_type == 0) { + angle_distance[i] = 0.0; + continue; + } + + // compute the angle distance as a function of 2 bond distances + + angle = force->angle->equilibrium_angle(i); + rsq = 2.0*bond_distance[bond1_type]*bond_distance[bond2_type] * + (1.0-cos(angle)); + angle_distance[i] = sqrt(rsq); + } +} + +/* ---------------------------------------------------------------------- + SHAKE as pre-integrator constraint +------------------------------------------------------------------------- */ + +void FixShakeCuda::setup(int vflag) +{ + pre_neighbor(); + + if (output_every) stats(); + + // setup SHAKE output + + int ntimestep = update->ntimestep; + next_output = ntimestep + output_every; + if (output_every == 0) next_output = update->laststep + 1; + if (output_every && ntimestep % output_every != 0) + next_output = (ntimestep/output_every)*output_every + output_every; + + // half timestep constraint on pre-step, full timestep thereafter + + if 
(strcmp(update->integrate_style,"verlet") == 0) { + dtv = update->dt; + dtfsq = 0.5 * update->dt * update->dt * force->ftm2v; + post_force(vflag); + dtfsq = update->dt * update->dt * force->ftm2v; + } else { + dtv = step_respa[0]; + dtf_innerhalf = 0.5 * step_respa[0] * force->ftm2v; + dtf_inner = dtf_innerhalf; + ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1); + post_force_respa(vflag,nlevels_respa-1,0); + ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1); + dtf_inner = step_respa[0] * force->ftm2v; + } + Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq, + cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(), + cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(), + max_iter,tolerance); +} + +/* ---------------------------------------------------------------------- + build list of SHAKE clusters to constrain + if one or more atoms in cluster are on this proc, + this proc lists the cluster exactly once +------------------------------------------------------------------------- */ + +void FixShakeCuda::pre_neighbor() +{ + int atom1,atom2,atom3,atom4; + + // local copies of atom quantities + // used by SHAKE until next re-neighboring + + x = atom->x; + v = atom->v; + f = atom->f; + mass = atom->mass; + rmass = atom->rmass; + type = atom->type; + nlocal = atom->nlocal; + + // extend size of SHAKE list if necessary + + if (nlocal > maxlist) { + maxlist = nlocal; + memory->destroy(list); + memory->create(list,maxlist,"shake:list"); + delete cu_list; cu_list = new cCudaData(list,maxlist); + } + + // build list of SHAKE clusters I compute + + nlist = 0; + int count2=0,count3=0,count4=0,count3a=0; + for (int i = 0; i < nlocal; i++) + if (shake_flag[i]) { + if(shake_flag[i] == 2) count2++; + if(shake_flag[i] == 3) count3++; + if(shake_flag[i] == 4) count4++; + if(shake_flag[i] == 1) count3a++; + + if (shake_flag[i] == 2) { + atom1 = atom->map(shake_atom[i][0]); + atom2 
= atom->map(shake_atom[i][1]); + if (atom1 == -1 || atom2 == -1) { + char str[128]; + sprintf(str, + "Shake atoms %d %d missing on proc %d at step " BIGINT_FORMAT, + shake_atom[i][0],shake_atom[i][1],me,update->ntimestep); + error->one(str); + } + if (i <= atom1 && i <= atom2) list[nlist++] = i; + } else if (shake_flag[i] % 2 == 1) { + atom1 = atom->map(shake_atom[i][0]); + atom2 = atom->map(shake_atom[i][1]); + atom3 = atom->map(shake_atom[i][2]); + if (atom1 == -1 || atom2 == -1 || atom3 == -1) { + char str[128]; + sprintf(str, + "Shake atoms %d %d %d missing on proc %d at step " + BIGINT_FORMAT, + shake_atom[i][0],shake_atom[i][1],shake_atom[i][2], + me,update->ntimestep); + error->one(str); + } + if (i <= atom1 && i <= atom2 && i <= atom3) list[nlist++] = i; + } else { + atom1 = atom->map(shake_atom[i][0]); + atom2 = atom->map(shake_atom[i][1]); + atom3 = atom->map(shake_atom[i][2]); + atom4 = atom->map(shake_atom[i][3]); + if (atom1 == -1 || atom2 == -1 || atom3 == -1 || atom4 == -1) { + char str[128]; + sprintf(str, + "Shake atoms %d %d %d %d missing on proc %d at step " + BIGINT_FORMAT, + shake_atom[i][0],shake_atom[i][1], + shake_atom[i][2],shake_atom[i][3], + me,update->ntimestep); + error->one(str); + } + if (i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4) + list[nlist++] = i; + } + } + count2/=2; + count3/=3; + count4/=4; + count3a/=3; + count3+=count2; + count4+=count3; + count3a+=count4; + for(int k = 0,l = count2; k < count2; k++) + { + if(shake_flag[list[k]]!=2) + { + while(shake_flag[list[l]]!=2 && lupload(); + cu_bond_distance->upload(); + cu_angle_distance->upload(); + cu_shake_flag->upload(); + cu_shake_atom->upload(); + cu_shake_type->upload(); + + neighbor_step=true; +} + +/* ---------------------------------------------------------------------- + compute the force adjustment for SHAKE constraint +------------------------------------------------------------------------- */ + +void FixShakeCuda::post_force(int vflag) +{ + timespec 
starttime; + timespec endtime; + + + if(cuda->finished_setup && neighbor_step) + { + Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq, + cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(), + cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(), + max_iter,tolerance); + + } + + if(not cuda->finished_setup) + cuda->downloadAll(); + if (update->ntimestep == next_output) + { + if(cuda->finished_setup) + cuda->cu_x->download(); + stats(); + } + + // xshake = unconstrained move with current v,f + + unconstrained_update(); + + // communicate results if necessary + + //if(cuda->finished_setup) cu_xshake->download(); + + if (nprocs > 1) + { + //if(cuda->finished_setup) + //cu_xshake->download(); + comm->forward_comm_fix(this); + //if(cuda->finished_setup) + //cu_xshake->upload(); + } + // virial setup + + if (vflag) v_setup(vflag); + else evflag = 0; + + // loop over clusters + + clock_gettime(CLOCK_REALTIME,&starttime); + if(cuda->finished_setup) + { + cu_virial->upload(); + if(vflag_atom) cuda->cu_vatom->upload(); + + Cuda_FixShakeCuda_Shake(&cuda->shared_data,vflag,vflag_atom,(int*)cu_list->dev_data(),nlist); + cu_virial->download(); + if(vflag_atom) cuda->cu_vatom->download(); + + } + else + for (int i = 0; i < nlist; i++) { + int m = list[i]; + if (shake_flag[m] == 2) shake2(m); + else if (shake_flag[m] == 3) shake3(m); + else if (shake_flag[m] == 4) shake4(m); + else shake3angle(m); + } + if((not cuda->finished_setup)) cuda->cu_f->upload(); + clock_gettime(CLOCK_REALTIME,&endtime); + if(cuda->finished_setup) + time_postforce+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + else + time_postforce=0.0; + //printf("Postforce time: %lf\n",time_postforce); +} + +/* ---------------------------------------------------------------------- + count # of degrees-of-freedom removed by SHAKE for atoms in igroup 
+------------------------------------------------------------------------- */ + +int FixShakeCuda::dof(int igroup) +{ + int groupbit = group->bitmask[igroup]; + + int *mask = atom->mask; + int *tag = atom->tag; + int nlocal = atom->nlocal; + + // count dof in a cluster if and only if + // the central atom is in group and atom i is the central atom + + int n = 0; + for (int i = 0; i < nlocal; i++) { + if (!(mask[i] & groupbit)) continue; + if (shake_flag[i] == 0) continue; + if (shake_atom[i][0] != tag[i]) continue; + if (shake_flag[i] == 1) n += 3; + else if (shake_flag[i] == 2) n += 1; + else if (shake_flag[i] == 3) n += 2; + else if (shake_flag[i] == 4) n += 3; + } + + int nall; + MPI_Allreduce(&n,&nall,1,MPI_INT,MPI_SUM,world); + return nall; +} + +/* ---------------------------------------------------------------------- + identify whether each atom is in a SHAKE cluster + only include atoms in fix group and those bonds/angles specified in input + test whether all clusters are valid + set shake_flag, shake_atom, shake_type values + set bond,angle types negative so will be ignored in neighbor lists +------------------------------------------------------------------------- */ + +void FixShakeCuda::find_clusters() +{ + int i,j,m,n; + int flag,flag_all,messtag,loop,nbuf,nbufmax,size; + double massone; + int *buf,*bufcopy; + MPI_Request request; + MPI_Status status; + + if (me == 0 && screen) fprintf(screen,"Finding SHAKE clusters ...\n"); + + // local copies of atom ptrs + + int *tag = atom->tag; + int *type = atom->type; + int *mask = atom->mask; + double *mass = atom->mass; + double *rmass = atom->rmass; + int **bond_type = atom->bond_type; + int **angle_type = atom->angle_type; + int **nspecial = atom->nspecial; + int **special = atom->special; + int nlocal = atom->nlocal; + + // setup ring of procs + + int next = me + 1; + int prev = me -1; + if (next == nprocs) next = 0; + if (prev < 0) prev = nprocs - 1; + + // 
----------------------------------------------------- + // allocate arrays for self (1d) and bond partners (2d) + // max = max # of bond partners for owned atoms = 2nd dim of partner arrays + // npartner[i] = # of bonds attached to atom i + // nshake[i] = # of SHAKE bonds attached to atom i + // partner_tag[i][] = global IDs of each partner + // partner_mask[i][] = mask of each partner + // partner_type[i][] = type of each partner + // partner_massflag[i][] = 1 if partner meets mass criterion, 0 if not + // partner_bondtype[i][] = type of bond attached to each partner + // partner_shake[i][] = 1 if SHAKE bonded to partner, 0 if not + // partner_nshake[i][] = nshake value for each partner + // ----------------------------------------------------- + + int max = 0; + for (i = 0; i < nlocal; i++) max = MAX(max,nspecial[i][0]); + + int *npartner,*nshake; + memory->create(npartner,nlocal,"shake:npartner"); + memory->create(nshake,nlocal,"shake:nshake"); + + int **partner_tag,**partner_mask,**partner_type,**partner_massflag; + int ** partner_bondtype,**partner_shake,**partner_nshake; + memory->create(partner_tag,nlocal,max,"shake:partner_tag"); + memory->create(partner_mask,nlocal,max,"shake:partner_mask"); + memory->create(partner_type,nlocal,max,"shake:partner_type"); + memory->create(partner_massflag,nlocal,max,"shake:partner_massflag"); + memory->create(partner_bondtype,nlocal,max,"shake:partner_bondtype"); + memory->create(partner_shake,nlocal,max,"shake:partner_shake"); + memory->create(partner_nshake,nlocal,max,"shake:partner_nshake"); + + // ----------------------------------------------------- + // set npartner and partner_tag from special arrays + // ----------------------------------------------------- + + for (i = 0; i < nlocal; i++) { + npartner[i] = nspecial[i][0]; + for (j = 0; j < npartner[i]; j++) partner_tag[i][j] = special[i][j]; + } + + // ----------------------------------------------------- + // set partner_mask, partner_type, partner_massflag, 
partner_bondtype + // for bonded partners + // requires communication for off-proc partners + // ----------------------------------------------------- + + // fill in mask, type, massflag, bondtype if own bond partner + // info to store in buf for each off-proc bond = nper = 6 + // 2 atoms IDs in bond, space for mask, type, massflag, bondtype + // nbufmax = largest buffer needed to hold info from any proc + + int nper = 6; + + nbuf = 0; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < npartner[i]; j++) { + partner_mask[i][j] = 0; + partner_type[i][j] = 0; + partner_massflag[i][j] = 0; + partner_bondtype[i][j] = 0; + + m = atom->map(partner_tag[i][j]); + if (m >= 0 && m < nlocal) { + partner_mask[i][j] = mask[m]; + partner_type[i][j] = type[m]; + if (nmass) { + if (rmass) massone = rmass[m]; + else massone = mass[type[m]]; + partner_massflag[i][j] = masscheck(massone); + } + n = bondfind(i,tag[i],partner_tag[i][j]); + if (n >= 0) partner_bondtype[i][j] = bond_type[i][n]; + else { + n = bondfind(m,tag[i],partner_tag[i][j]); + if (n >= 0) partner_bondtype[i][j] = bond_type[m][n]; + } + } else nbuf += nper; + } + } + + MPI_Allreduce(&nbuf,&nbufmax,1,MPI_INT,MPI_MAX,world); + + buf = new int[nbufmax]; + bufcopy = new int[nbufmax]; + + // fill buffer with info + + size = 0; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < npartner[i]; j++) { + m = atom->map(partner_tag[i][j]); + if (m < 0 || m >= nlocal) { + buf[size] = tag[i]; + buf[size+1] = partner_tag[i][j]; + buf[size+2] = 0; + buf[size+3] = 0; + buf[size+4] = 0; + n = bondfind(i,tag[i],partner_tag[i][j]); + if (n >= 0) buf[size+5] = bond_type[i][n]; + else buf[size+5] = 0; + size += nper; + } + } + } + + // cycle buffer around ring of procs back to self + // when receive buffer, scan bond partner IDs for atoms I own + // if I own partner: + // fill in mask and type and massflag + // search for bond with 1st atom and fill in bondtype + + messtag = 1; + for (loop = 0; loop < nprocs; loop++) { + i = 0; + while (i < 
size) { + m = atom->map(buf[i+1]); + if (m >= 0 && m < nlocal) { + buf[i+2] = mask[m]; + buf[i+3] = type[m]; + if (nmass) { + if (rmass) massone = rmass[m]; + else massone = mass[type[m]]; + buf[i+4] = masscheck(massone); + } + if (buf[i+5] == 0) { + n = bondfind(m,buf[i],buf[i+1]); + if (n >= 0) buf[i+5] = bond_type[m][n]; + } + } + i += nper; + } + if (me != next) { + MPI_Irecv(bufcopy,nbufmax,MPI_INT,prev,messtag,world,&request); + MPI_Send(buf,size,MPI_INT,next,messtag,world); + MPI_Wait(&request,&status); + MPI_Get_count(&status,MPI_INT,&size); + for (j = 0; j < size; j++) buf[j] = bufcopy[j]; + } + } + + // store partner info returned to me + + m = 0; + while (m < size) { + i = atom->map(buf[m]); + for (j = 0; j < npartner[i]; j++) + if (buf[m+1] == partner_tag[i][j]) break; + partner_mask[i][j] = buf[m+2]; + partner_type[i][j] = buf[m+3]; + partner_massflag[i][j] = buf[m+4]; + partner_bondtype[i][j] = buf[m+5]; + m += nper; + } + + delete [] buf; + delete [] bufcopy; + + // error check for unfilled partner info + // if partner_type not set, is an error + // partner_bondtype may not be set if special list is not consistent + // with bondatom (e.g. 
due to delete_bonds command) + // this is OK if one or both atoms are not in fix group, since + // bond won't be SHAKEn anyway + // else it's an error + + flag = 0; + for (i = 0; i < nlocal; i++) + for (j = 0; j < npartner[i]; j++) { + if (partner_type[i][j] == 0) flag = 1; + if (!(mask[i] & groupbit)) continue; + if (!(partner_mask[i][j] & groupbit)) continue; + if (partner_bondtype[i][j] == 0) flag = 1; + } + + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + if (flag_all) error->all("Did not find fix shake partner info"); + + // ----------------------------------------------------- + // identify SHAKEable bonds + // set nshake[i] = # of SHAKE bonds attached to atom i + // set partner_shake[i][] = 1 if SHAKE bonded to partner, 0 if not + // both atoms must be in group, bondtype must be > 0 + // check if bondtype is in input bond_flag + // check if type of either atom is in input type_flag + // check if mass of either atom is in input mass_list + // ----------------------------------------------------- + + int np; + + for (i = 0; i < nlocal; i++) { + nshake[i] = 0; + np = npartner[i]; + for (j = 0; j < np; j++) { + partner_shake[i][j] = 0; + + if (!(mask[i] & groupbit)) continue; + if (!(partner_mask[i][j] & groupbit)) continue; + if (partner_bondtype[i][j] <= 0) continue; + + if (bond_flag[partner_bondtype[i][j]]) { + partner_shake[i][j] = 1; + nshake[i]++; + continue; + } + if (type_flag[type[i]] || type_flag[partner_type[i][j]]) { + partner_shake[i][j] = 1; + nshake[i]++; + continue; + } + if (nmass) { + if (partner_massflag[i][j]) { + partner_shake[i][j] = 1; + nshake[i]++; + continue; + } else { + if (rmass) massone = rmass[i]; + else massone = mass[type[i]]; + if (masscheck(massone)) { + partner_shake[i][j] = 1; + nshake[i]++; + continue; + } + } + } + } + } + + // ----------------------------------------------------- + // set partner_nshake for bonded partners + // requires communication for off-proc partners + // 
----------------------------------------------------- + + // fill in partner_nshake if own bond partner + // info to store in buf for each off-proc bond = + // 2 atoms IDs in bond, space for nshake value + // nbufmax = largest buffer needed to hold info from any proc + + nbuf = 0; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < npartner[i]; j++) { + m = atom->map(partner_tag[i][j]); + if (m >= 0 && m < nlocal) partner_nshake[i][j] = nshake[m]; + else nbuf += 3; + } + } + + MPI_Allreduce(&nbuf,&nbufmax,1,MPI_INT,MPI_MAX,world); + + buf = new int[nbufmax]; + bufcopy = new int[nbufmax]; + + // fill buffer with info + + size = 0; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < npartner[i]; j++) { + m = atom->map(partner_tag[i][j]); + if (m < 0 || m >= nlocal) { + buf[size] = tag[i]; + buf[size+1] = partner_tag[i][j]; + size += 3; + } + } + } + + // cycle buffer around ring of procs back to self + // when receive buffer, scan bond partner IDs for atoms I own + // if I own partner, fill in nshake value + + messtag = 2; + for (loop = 0; loop < nprocs; loop++) { + i = 0; + while (i < size) { + m = atom->map(buf[i+1]); + if (m >= 0 && m < nlocal) buf[i+2] = nshake[m]; + i += 3; + } + if (me != next) { + MPI_Irecv(bufcopy,nbufmax,MPI_INT,prev,messtag,world,&request); + MPI_Send(buf,size,MPI_INT,next,messtag,world); + MPI_Wait(&request,&status); + MPI_Get_count(&status,MPI_INT,&size); + for (j = 0; j < size; j++) buf[j] = bufcopy[j]; + } + } + + // store partner info returned to me + + m = 0; + while (m < size) { + i = atom->map(buf[m]); + for (j = 0; j < npartner[i]; j++) + if (buf[m+1] == partner_tag[i][j]) break; + partner_nshake[i][j] = buf[m+2]; + m += 3; + } + + delete [] buf; + delete [] bufcopy; + + // ----------------------------------------------------- + // error checks + // no atom with nshake > 3 + // no connected atoms which both have nshake > 1 + // ----------------------------------------------------- + + flag = 0; + for (i = 0; i < nlocal; i++) if 
(nshake[i] > 3) flag = 1; + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + if (flag_all) error->all("Shake cluster of more than 4 atoms"); + + flag = 0; + for (i = 0; i < nlocal; i++) { + if (nshake[i] <= 1) continue; + for (j = 0; j < npartner[i]; j++) + if (partner_shake[i][j] && partner_nshake[i][j] > 1) flag = 1; + } + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + if (flag_all) error->all("Shake clusters are connected"); + + // ----------------------------------------------------- + // set SHAKE arrays that are stored with atoms & add angle constraints + // zero shake arrays for all owned atoms + // if I am central atom set shake_flag & shake_atom & shake_type + // for 2-atom clusters, I am central atom if my atom ID < partner ID + // for 3-atom clusters, test for angle constraint + // angle will be stored by this atom if it exists + // if angle type matches angle_flag, then it is angle-constrained + // shake_flag[] = 0 if atom not in SHAKE cluster + // 2,3,4 = size of bond-only cluster + // 1 = 3-atom angle cluster + // shake_atom[][] = global IDs of 2,3,4 atoms in cluster + // central atom is 1st + // for 2-atom cluster, lowest ID is 1st + // shake_type[][] = bondtype of each bond in cluster + // for 3-atom angle cluster, 3rd value is angletype + // ----------------------------------------------------- + + for (i = 0; i < nlocal; i++) { + shake_flag[i] = 0; + shake_atom[i][0] = 0; + shake_atom[i][1] = 0; + shake_atom[i][2] = 0; + shake_atom[i][3] = 0; + shake_type[i][0] = 0; + shake_type[i][1] = 0; + shake_type[i][2] = 0; + + if (nshake[i] == 1) { + for (j = 0; j < npartner[i]; j++) + if (partner_shake[i][j]) break; + if (partner_nshake[i][j] == 1 && tag[i] < partner_tag[i][j]) { + shake_flag[i] = 2; + shake_atom[i][0] = tag[i]; + shake_atom[i][1] = partner_tag[i][j]; + shake_type[i][0] = partner_bondtype[i][j]; + } + } + + if (nshake[i] > 1) { + shake_flag[i] = 1; + shake_atom[i][0] = tag[i]; + for (j = 0; j < npartner[i]; j++) + if 
(partner_shake[i][j]) { + m = shake_flag[i]; + shake_atom[i][m] = partner_tag[i][j]; + shake_type[i][m-1] = partner_bondtype[i][j]; + shake_flag[i]++; + } + } + + if (nshake[i] == 2) { + n = anglefind(i,shake_atom[i][1],shake_atom[i][2]); + if (n < 0) continue; + if (angle_type[i][n] < 0) continue; + if (angle_flag[angle_type[i][n]]) { + shake_flag[i] = 1; + shake_type[i][2] = angle_type[i][n]; + } + } + } + + // ----------------------------------------------------- + // set shake_flag,shake_atom,shake_type for non-central atoms + // requires communication for off-proc atoms + // ----------------------------------------------------- + + // fill in shake arrays for each bond partner I own + // info to store in buf for each off-proc bond = + // all values from shake_flag, shake_atom, shake_type + // nbufmax = largest buffer needed to hold info from any proc + + nbuf = 0; + for (i = 0; i < nlocal; i++) { + if (shake_flag[i] == 0) continue; + for (j = 0; j < npartner[i]; j++) { + if (partner_shake[i][j] == 0) continue; + m = atom->map(partner_tag[i][j]); + if (m >= 0 && m < nlocal) { + shake_flag[m] = shake_flag[i]; + shake_atom[m][0] = shake_atom[i][0]; + shake_atom[m][1] = shake_atom[i][1]; + shake_atom[m][2] = shake_atom[i][2]; + shake_atom[m][3] = shake_atom[i][3]; + shake_type[m][0] = shake_type[i][0]; + shake_type[m][1] = shake_type[i][1]; + shake_type[m][2] = shake_type[i][2]; + } else nbuf += 9; + } + } + + MPI_Allreduce(&nbuf,&nbufmax,1,MPI_INT,MPI_MAX,world); + + buf = new int[nbufmax]; + bufcopy = new int[nbufmax]; + + // fill buffer with info + + size = 0; + for (i = 0; i < nlocal; i++) { + if (shake_flag[i] == 0) continue; + for (j = 0; j < npartner[i]; j++) { + if (partner_shake[i][j] == 0) continue; + m = atom->map(partner_tag[i][j]); + if (m < 0 || m >= nlocal) { + buf[size] = partner_tag[i][j]; + buf[size+1] = shake_flag[i]; + buf[size+2] = shake_atom[i][0]; + buf[size+3] = shake_atom[i][1]; + buf[size+4] = shake_atom[i][2]; + buf[size+5] = 
shake_atom[i][3]; + buf[size+6] = shake_type[i][0]; + buf[size+7] = shake_type[i][1]; + buf[size+8] = shake_type[i][2]; + size += 9; + } + } + } + + // cycle buffer around ring of procs back to self + // when receive buffer, scan for ID that I own + // if I own ID, fill in shake array values + + messtag = 3; + for (loop = 0; loop < nprocs; loop++) { + i = 0; + while (i < size) { + m = atom->map(buf[i]); + if (m >= 0 && m < nlocal) { + shake_flag[m] = buf[i+1]; + shake_atom[m][0] = buf[i+2]; + shake_atom[m][1] = buf[i+3]; + shake_atom[m][2] = buf[i+4]; + shake_atom[m][3] = buf[i+5]; + shake_type[m][0] = buf[i+6]; + shake_type[m][1] = buf[i+7]; + shake_type[m][2] = buf[i+8]; + } + i += 9; + } + if (me != next) { + MPI_Irecv(bufcopy,nbufmax,MPI_INT,prev,messtag,world,&request); + MPI_Send(buf,size,MPI_INT,next,messtag,world); + MPI_Wait(&request,&status); + MPI_Get_count(&status,MPI_INT,&size); + for (j = 0; j < size; j++) buf[j] = bufcopy[j]; + } + } + + delete [] buf; + delete [] bufcopy; + + // ----------------------------------------------------- + // free local memory + // ----------------------------------------------------- + + memory->destroy(npartner); + memory->destroy(nshake); + memory->destroy(partner_tag); + memory->destroy(partner_mask); + memory->destroy(partner_type); + memory->destroy(partner_massflag); + memory->destroy(partner_bondtype); + memory->destroy(partner_shake); + memory->destroy(partner_nshake); + + // ----------------------------------------------------- + // set bond_type and angle_type negative for SHAKE clusters + // must set for all SHAKE bonds and angles stored by each atom + // ----------------------------------------------------- + + for (i = 0; i < nlocal; i++) { + if (shake_flag[i] == 0) continue; + else if (shake_flag[i] == 1) { + n = bondfind(i,shake_atom[i][0],shake_atom[i][1]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = bondfind(i,shake_atom[i][0],shake_atom[i][2]); + if (n >= 0) bond_type[i][n] = 
-bond_type[i][n]; + n = anglefind(i,shake_atom[i][1],shake_atom[i][2]); + if (n >= 0) angle_type[i][n] = -angle_type[i][n]; + } else if (shake_flag[i] == 2) { + n = bondfind(i,shake_atom[i][0],shake_atom[i][1]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + } else if (shake_flag[i] == 3) { + n = bondfind(i,shake_atom[i][0],shake_atom[i][1]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = bondfind(i,shake_atom[i][0],shake_atom[i][2]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + } else if (shake_flag[i] == 4) { + n = bondfind(i,shake_atom[i][0],shake_atom[i][1]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = bondfind(i,shake_atom[i][0],shake_atom[i][2]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + n = bondfind(i,shake_atom[i][0],shake_atom[i][3]); + if (n >= 0) bond_type[i][n] = -bond_type[i][n]; + } + } + + // ----------------------------------------------------- + // print info on SHAKE clusters + // ----------------------------------------------------- + + int count1,count2,count3,count4; + count1 = count2 = count3 = count4 = 0; + for (i = 0; i < nlocal; i++) { + if (shake_flag[i] == 1) count1++; + else if (shake_flag[i] == 2) count2++; + else if (shake_flag[i] == 3) count3++; + else if (shake_flag[i] == 4) count4++; + } + + for(int i=0;iupload(); + cu_shake_atom->upload(); + cu_shake_type->upload(); + Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq, + cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(), + cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(), + max_iter,tolerance); + +} + +void FixShakeCuda::swap_clusters(int i, int j) +{ + int tmp; + tmp = shake_flag[i]; shake_flag[i] = shake_flag[j]; shake_flag[j] = tmp; + tmp = shake_atom[i][0]; shake_atom[i][0] = shake_atom[j][0]; shake_atom[j][0] = tmp; + tmp = shake_atom[i][1]; shake_atom[i][1] = shake_atom[j][1]; shake_atom[j][1] = tmp; + tmp = shake_atom[i][2]; shake_atom[i][2] = 
shake_atom[j][2]; shake_atom[j][2] = tmp; + tmp = shake_atom[i][3]; shake_atom[i][3] = shake_atom[j][3]; shake_atom[j][3] = tmp; + tmp = shake_type[i][0]; shake_type[i][0] = shake_type[j][0]; shake_type[j][0] = tmp; + tmp = shake_type[i][1]; shake_type[i][1] = shake_type[j][1]; shake_type[j][1] = tmp; + tmp = shake_type[i][2]; shake_type[i][2] = shake_type[j][2]; shake_type[j][2] = tmp; +} + +/* ---------------------------------------------------------------------- + check if massone is within MASSDELTA of any mass in mass_list + return 1 if yes, 0 if not +------------------------------------------------------------------------- */ + +int FixShakeCuda::masscheck(double massone) +{ + for (int i = 0; i < nmass; i++) + if (fabs(mass_list[i]-massone) <= MASSDELTA) return 1; + return 0; +} + +/* ---------------------------------------------------------------------- + update the unconstrained position of each atom + only for SHAKE clusters, else set to 0.0 + assumes NVE update, seems to be accurate enough for NVT,NPT,NPH as well +------------------------------------------------------------------------- */ + +void FixShakeCuda::unconstrained_update() +{ + if(cuda->finished_setup) + { + Cuda_FixShakeCuda_UnconstrainedUpdate(&cuda->shared_data); + return; + } + + double dtfmsq; + + if (rmass) { + for (int i = 0; i < nlocal; i++) { + if (shake_flag[i]) { + dtfmsq = dtfsq / rmass[i]; + xshake[i][0] = x[i][0] + dtv*v[i][0] + dtfmsq*f[i][0]; + xshake[i][1] = x[i][1] + dtv*v[i][1] + dtfmsq*f[i][1]; + xshake[i][2] = x[i][2] + dtv*v[i][2] + dtfmsq*f[i][2]; + } else xshake[i][2] = xshake[i][1] = xshake[i][0] = 0.0; + } + } else { + for (int i = 0; i < nlocal; i++) { + if (shake_flag[i]) { + dtfmsq = dtfsq / mass[type[i]]; + xshake[i][0] = x[i][0] + dtv*v[i][0] + dtfmsq*f[i][0]; + xshake[i][1] = x[i][1] + dtv*v[i][1] + dtfmsq*f[i][1]; + xshake[i][2] = x[i][2] + dtv*v[i][2] + dtfmsq*f[i][2]; + } else xshake[i][2] = xshake[i][1] = xshake[i][0] = 0.0; + } + } + cu_xshake->upload(); +} 
+ +/* ---------------------------------------------------------------------- */ + +void FixShakeCuda::shake2(int m) +{ + int nlist,list[2]; + double v[6]; + double invmass0,invmass1; + + // local atom IDs and constraint distances + + int i0 = atom->map(shake_atom[m][0]); + int i1 = atom->map(shake_atom[m][1]); + double bond1 = bond_distance[shake_type[m][0]]; + + // r01 = distance vec between atoms, with PBC + + double r01[3]; + r01[0] = x[i0][0] - x[i1][0]; + r01[1] = x[i0][1] - x[i1][1]; + r01[2] = x[i0][2] - x[i1][2]; + domain->minimum_image(r01); + + // s01 = distance vec after unconstrained update, with PBC + + double s01[3]; + s01[0] = xshake[i0][0] - xshake[i1][0]; + s01[1] = xshake[i0][1] - xshake[i1][1]; + s01[2] = xshake[i0][2] - xshake[i1][2]; + domain->minimum_image(s01); + + // scalar distances between atoms + + double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2]; + double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2]; + + // a,b,c = coeffs in quadratic equation for lamda + + if (rmass) { + invmass0 = 1.0/rmass[i0]; + invmass1 = 1.0/rmass[i1]; + } else { + invmass0 = 1.0/mass[type[i0]]; + invmass1 = 1.0/mass[type[i1]]; + } + + double a = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + double b = 2.0 * (invmass0+invmass1) * + (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]); + double c = s01sq - bond1*bond1; + + // error check + + double determ = b*b - 4.0*a*c; + if (determ < 0.0) { + error->warning("Shake determinant < 0.0"); + determ = 0.0; + } + + // exact quadratic solution for lamda + + double lamda,lamda1,lamda2; + lamda1 = (-b+sqrt(determ)) / (2.0*a); + lamda2 = (-b-sqrt(determ)) / (2.0*a); + + if (fabs(lamda1) <= fabs(lamda2)) lamda = lamda1; + else lamda = lamda2; + + // update forces if atom is owned by this processor + lamda /= dtfsq; + + if (i0 < nlocal) { + f[i0][0] += lamda*r01[0]; + f[i0][1] += lamda*r01[1]; + f[i0][2] += lamda*r01[2]; + } + + if (i1 < nlocal) { + f[i1][0] -= lamda*r01[0]; + f[i1][1] -= lamda*r01[1]; + 
f[i1][2] -= lamda*r01[2]; + } + + if (evflag) { + nlist = 0; + if (i0 < nlocal) list[nlist++] = i0; + if (i1 < nlocal) list[nlist++] = i1; + + v[0] = lamda*r01[0]*r01[0]; + v[1] = lamda*r01[1]*r01[1]; + v[2] = lamda*r01[2]*r01[2]; + v[3] = lamda*r01[0]*r01[1]; + v[4] = lamda*r01[0]*r01[2]; + v[5] = lamda*r01[1]*r01[2]; + + v_tally(nlist,list,2.0,v); + } +} + +/* ---------------------------------------------------------------------- */ + +void FixShakeCuda::shake3(int m) +{ + int nlist,list[3]; + double v[6]; + double invmass0,invmass1,invmass2; + + // local atom IDs and constraint distances + + int i0 = atom->map(shake_atom[m][0]); + int i1 = atom->map(shake_atom[m][1]); + int i2 = atom->map(shake_atom[m][2]); + double bond1 = bond_distance[shake_type[m][0]]; + double bond2 = bond_distance[shake_type[m][1]]; + + // r01,r02 = distance vec between atoms, with PBC + + double r01[3]; + r01[0] = x[i0][0] - x[i1][0]; + r01[1] = x[i0][1] - x[i1][1]; + r01[2] = x[i0][2] - x[i1][2]; + domain->minimum_image(r01); + + double r02[3]; + r02[0] = x[i0][0] - x[i2][0]; + r02[1] = x[i0][1] - x[i2][1]; + r02[2] = x[i0][2] - x[i2][2]; + domain->minimum_image(r02); + + // s01,s02 = distance vec after unconstrained update, with PBC + + double s01[3]; + s01[0] = xshake[i0][0] - xshake[i1][0]; + s01[1] = xshake[i0][1] - xshake[i1][1]; + s01[2] = xshake[i0][2] - xshake[i1][2]; + domain->minimum_image(s01); + + double s02[3]; + s02[0] = xshake[i0][0] - xshake[i2][0]; + s02[1] = xshake[i0][1] - xshake[i2][1]; + s02[2] = xshake[i0][2] - xshake[i2][2]; + domain->minimum_image(s02); + + // scalar distances between atoms + + double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2]; + double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2]; + double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2]; + double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2]; + + // matrix coeffs and rhs for lamda equations + + if (rmass) { + invmass0 = 1.0/rmass[i0]; + invmass1 = 1.0/rmass[i1]; 
+ invmass2 = 1.0/rmass[i2]; + } else { + invmass0 = 1.0/mass[type[i0]]; + invmass1 = 1.0/mass[type[i1]]; + invmass2 = 1.0/mass[type[i2]]; + } + + double a11 = 2.0 * (invmass0+invmass1) * + (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]); + double a12 = 2.0 * invmass0 * + (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]); + double a21 = 2.0 * invmass0 * + (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]); + double a22 = 2.0 * (invmass0+invmass2) * + (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]); + + // inverse of matrix + + double determ = a11*a22 - a12*a21; + if (determ == 0.0) error->one("Shake determinant = 0.0"); + double determinv = 1.0/determ; + + double a11inv = a22*determinv; + double a12inv = -a12*determinv; + double a21inv = -a21*determinv; + double a22inv = a11*determinv; + + // quadratic correction coeffs + + double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]); + + double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + double quad1_0202 = invmass0*invmass0 * r02sq; + double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102; + + double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + double quad2_0101 = invmass0*invmass0 * r01sq; + double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102; + + // iterate until converged + + double lamda01 = 0.0; + double lamda02 = 0.0; + int niter = 0; + int done = 0; + + double quad1,quad2,b1,b2,lamda01_new,lamda02_new; + + while (!done && niter < max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 + + quad1_0102 * lamda01*lamda02; + quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 + + quad2_0102 * lamda01*lamda02; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + + lamda01_new = a11inv*b1 + a12inv*b2; + lamda02_new = a21inv*b1 + a22inv*b2; + + done = 1; + if (fabs(lamda01_new-lamda01) > tolerance) done = 0; + if (fabs(lamda02_new-lamda02) > tolerance) done = 0; + + lamda01 = lamda01_new; + 
lamda02 = lamda02_new; + niter++; + } + + // update forces if atom is owned by this processor + + lamda01 = lamda01/dtfsq; + lamda02 = lamda02/dtfsq; + + if (i0 < nlocal) { + f[i0][0] += lamda01*r01[0] + lamda02*r02[0]; + f[i0][1] += lamda01*r01[1] + lamda02*r02[1]; + f[i0][2] += lamda01*r01[2] + lamda02*r02[2]; + } + + if (i1 < nlocal) { + f[i1][0] -= lamda01*r01[0]; + f[i1][1] -= lamda01*r01[1]; + f[i1][2] -= lamda01*r01[2]; + } + + if (i2 < nlocal) { + f[i2][0] -= lamda02*r02[0]; + f[i2][1] -= lamda02*r02[1]; + f[i2][2] -= lamda02*r02[2]; + } + + if (evflag) { + nlist = 0; + if (i0 < nlocal) list[nlist++] = i0; + if (i1 < nlocal) list[nlist++] = i1; + if (i2 < nlocal) list[nlist++] = i2; + + v[0] = lamda01*r01[0]*r01[0] + lamda02*r02[0]*r02[0]; + v[1] = lamda01*r01[1]*r01[1] + lamda02*r02[1]*r02[1]; + v[2] = lamda01*r01[2]*r01[2] + lamda02*r02[2]*r02[2]; + v[3] = lamda01*r01[0]*r01[1] + lamda02*r02[0]*r02[1]; + v[4] = lamda01*r01[0]*r01[2] + lamda02*r02[0]*r02[2]; + v[5] = lamda01*r01[1]*r01[2] + lamda02*r02[1]*r02[2]; + + v_tally(nlist,list,3.0,v); + } +} + +/* ---------------------------------------------------------------------- */ + +void FixShakeCuda::shake4(int m) +{ + int nlist,list[4]; + double v[6]; + double invmass0,invmass1,invmass2,invmass3; + + // local atom IDs and constraint distances + + int i0 = atom->map(shake_atom[m][0]); + int i1 = atom->map(shake_atom[m][1]); + int i2 = atom->map(shake_atom[m][2]); + int i3 = atom->map(shake_atom[m][3]); + double bond1 = bond_distance[shake_type[m][0]]; + double bond2 = bond_distance[shake_type[m][1]]; + double bond3 = bond_distance[shake_type[m][2]]; + + // r01,r02,r03 = distance vec between atoms, with PBC + + double r01[3]; + r01[0] = x[i0][0] - x[i1][0]; + r01[1] = x[i0][1] - x[i1][1]; + r01[2] = x[i0][2] - x[i1][2]; + domain->minimum_image(r01); + + double r02[3]; + r02[0] = x[i0][0] - x[i2][0]; + r02[1] = x[i0][1] - x[i2][1]; + r02[2] = x[i0][2] - x[i2][2]; + domain->minimum_image(r02); + + double 
r03[3]; + r03[0] = x[i0][0] - x[i3][0]; + r03[1] = x[i0][1] - x[i3][1]; + r03[2] = x[i0][2] - x[i3][2]; + domain->minimum_image(r03); + + // s01,s02,s03 = distance vec after unconstrained update, with PBC + + double s01[3]; + s01[0] = xshake[i0][0] - xshake[i1][0]; + s01[1] = xshake[i0][1] - xshake[i1][1]; + s01[2] = xshake[i0][2] - xshake[i1][2]; + domain->minimum_image(s01); + + double s02[3]; + s02[0] = xshake[i0][0] - xshake[i2][0]; + s02[1] = xshake[i0][1] - xshake[i2][1]; + s02[2] = xshake[i0][2] - xshake[i2][2]; + domain->minimum_image(s02); + + double s03[3]; + s03[0] = xshake[i0][0] - xshake[i3][0]; + s03[1] = xshake[i0][1] - xshake[i3][1]; + s03[2] = xshake[i0][2] - xshake[i3][2]; + domain->minimum_image(s03); + + // scalar distances between atoms + + double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2]; + double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2]; + double r03sq = r03[0]*r03[0] + r03[1]*r03[1] + r03[2]*r03[2]; + double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2]; + double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2]; + double s03sq = s03[0]*s03[0] + s03[1]*s03[1] + s03[2]*s03[2]; + + // matrix coeffs and rhs for lamda equations + + if (rmass) { + invmass0 = 1.0/rmass[i0]; + invmass1 = 1.0/rmass[i1]; + invmass2 = 1.0/rmass[i2]; + invmass3 = 1.0/rmass[i3]; + } else { + invmass0 = 1.0/mass[type[i0]]; + invmass1 = 1.0/mass[type[i1]]; + invmass2 = 1.0/mass[type[i2]]; + invmass3 = 1.0/mass[type[i3]]; + } + + double a11 = 2.0 * (invmass0+invmass1) * + (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]); + double a12 = 2.0 * invmass0 * + (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]); + double a13 = 2.0 * invmass0 * + (s01[0]*r03[0] + s01[1]*r03[1] + s01[2]*r03[2]); + double a21 = 2.0 * invmass0 * + (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]); + double a22 = 2.0 * (invmass0+invmass2) * + (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]); + double a23 = 2.0 * invmass0 * + (s02[0]*r03[0] + s02[1]*r03[1] + 
s02[2]*r03[2]); + double a31 = 2.0 * invmass0 * + (s03[0]*r01[0] + s03[1]*r01[1] + s03[2]*r01[2]); + double a32 = 2.0 * invmass0 * + (s03[0]*r02[0] + s03[1]*r02[1] + s03[2]*r02[2]); + double a33 = 2.0 * (invmass0+invmass3) * + (s03[0]*r03[0] + s03[1]*r03[1] + s03[2]*r03[2]); + + // inverse of matrix; + + double determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - + a11*a23*a32 - a12*a21*a33 - a13*a22*a31; + if (determ == 0.0) error->one("Shake determinant = 0.0"); + double determinv = 1.0/determ; + + double a11inv = determinv * (a22*a33 - a23*a32); + double a12inv = -determinv * (a12*a33 - a13*a32); + double a13inv = determinv * (a12*a23 - a13*a22); + double a21inv = -determinv * (a21*a33 - a23*a31); + double a22inv = determinv * (a11*a33 - a13*a31); + double a23inv = -determinv * (a11*a23 - a13*a21); + double a31inv = determinv * (a21*a32 - a22*a31); + double a32inv = -determinv * (a11*a32 - a12*a31); + double a33inv = determinv * (a11*a22 - a12*a21); + + // quadratic correction coeffs + + double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]); + double r0103 = (r01[0]*r03[0] + r01[1]*r03[1] + r01[2]*r03[2]); + double r0203 = (r02[0]*r03[0] + r02[1]*r03[1] + r02[2]*r03[2]); + + double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + double quad1_0202 = invmass0*invmass0 * r02sq; + double quad1_0303 = invmass0*invmass0 * r03sq; + double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102; + double quad1_0103 = 2.0 * (invmass0+invmass1)*invmass0 * r0103; + double quad1_0203 = 2.0 * invmass0*invmass0 * r0203; + + double quad2_0101 = invmass0*invmass0 * r01sq; + double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + double quad2_0303 = invmass0*invmass0 * r03sq; + double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102; + double quad2_0103 = 2.0 * invmass0*invmass0 * r0103; + double quad2_0203 = 2.0 * (invmass0+invmass2)*invmass0 * r0203; + + double quad3_0101 = invmass0*invmass0 * r01sq; + double quad3_0202 = 
invmass0*invmass0 * r02sq; + double quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq; + double quad3_0102 = 2.0 * invmass0*invmass0 * r0102; + double quad3_0103 = 2.0 * (invmass0+invmass3)*invmass0 * r0103; + double quad3_0203 = 2.0 * (invmass0+invmass3)*invmass0 * r0203; + + // iterate until converged + + double lamda01 = 0.0; + double lamda02 = 0.0; + double lamda03 = 0.0; + int niter = 0; + int done = 0; + + double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new; + + while (!done && niter < max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + + quad1_0202 * lamda02*lamda02 + + quad1_0303 * lamda03*lamda03 + + quad1_0102 * lamda01*lamda02 + + quad1_0103 * lamda01*lamda03 + + quad1_0203 * lamda02*lamda03; + + quad2 = quad2_0101 * lamda01*lamda01 + + quad2_0202 * lamda02*lamda02 + + quad2_0303 * lamda03*lamda03 + + quad2_0102 * lamda01*lamda02 + + quad2_0103 * lamda01*lamda03 + + quad2_0203 * lamda02*lamda03; + + quad3 = quad3_0101 * lamda01*lamda01 + + quad3_0202 * lamda02*lamda02 + + quad3_0303 * lamda03*lamda03 + + quad3_0102 * lamda01*lamda02 + + quad3_0103 * lamda01*lamda03 + + quad3_0203 * lamda02*lamda03; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + b3 = bond3*bond3 - s03sq - quad3; + + lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; + lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; + lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3; + + done = 1; + if (fabs(lamda01_new-lamda01) > tolerance) done = 0; + if (fabs(lamda02_new-lamda02) > tolerance) done = 0; + if (fabs(lamda03_new-lamda03) > tolerance) done = 0; + + lamda01 = lamda01_new; + lamda02 = lamda02_new; + lamda03 = lamda03_new; + niter++; + } + + // update forces if atom is owned by this processor + + lamda01 = lamda01/dtfsq; + lamda02 = lamda02/dtfsq; + lamda03 = lamda03/dtfsq; + + if (i0 < nlocal) { + f[i0][0] += lamda01*r01[0] + lamda02*r02[0] + lamda03*r03[0]; + f[i0][1] += lamda01*r01[1] + lamda02*r02[1] + lamda03*r03[1]; + f[i0][2] += 
lamda01*r01[2] + lamda02*r02[2] + lamda03*r03[2]; + } + + if (i1 < nlocal) { + f[i1][0] -= lamda01*r01[0]; + f[i1][1] -= lamda01*r01[1]; + f[i1][2] -= lamda01*r01[2]; + } + + if (i2 < nlocal) { + f[i2][0] -= lamda02*r02[0]; + f[i2][1] -= lamda02*r02[1]; + f[i2][2] -= lamda02*r02[2]; + } + + if (i3 < nlocal) { + f[i3][0] -= lamda03*r03[0]; + f[i3][1] -= lamda03*r03[1]; + f[i3][2] -= lamda03*r03[2]; + } + + if (evflag) { + nlist = 0; + if (i0 < nlocal) list[nlist++] = i0; + if (i1 < nlocal) list[nlist++] = i1; + if (i2 < nlocal) list[nlist++] = i2; + if (i3 < nlocal) list[nlist++] = i3; + + v[0] = lamda01*r01[0]*r01[0]+lamda02*r02[0]*r02[0]+lamda03*r03[0]*r03[0]; + v[1] = lamda01*r01[1]*r01[1]+lamda02*r02[1]*r02[1]+lamda03*r03[1]*r03[1]; + v[2] = lamda01*r01[2]*r01[2]+lamda02*r02[2]*r02[2]+lamda03*r03[2]*r03[2]; + v[3] = lamda01*r01[0]*r01[1]+lamda02*r02[0]*r02[1]+lamda03*r03[0]*r03[1]; + v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda03*r03[0]*r03[2]; + v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda03*r03[1]*r03[2]; +//if(i0==7271) printf("%lf %lf %lf %lf %lf %lf\n",v[0],v[1],v[2],v[3],v[4],v[5]); + + v_tally(nlist,list,4.0,v); + } +} + +/* ---------------------------------------------------------------------- */ + +void FixShakeCuda::shake3angle(int m) +{ + int nlist,list[3]; + double v[6]; + double invmass0,invmass1,invmass2; + + // local atom IDs and constraint distances + + int i0 = atom->map(shake_atom[m][0]); + int i1 = atom->map(shake_atom[m][1]); + int i2 = atom->map(shake_atom[m][2]); + double bond1 = bond_distance[shake_type[m][0]]; + double bond2 = bond_distance[shake_type[m][1]]; + double bond12 = angle_distance[shake_type[m][2]]; + + // r01,r02,r12 = distance vec between atoms, with PBC + + double r01[3]; + r01[0] = x[i0][0] - x[i1][0]; + r01[1] = x[i0][1] - x[i1][1]; + r01[2] = x[i0][2] - x[i1][2]; + domain->minimum_image(r01); + + double r02[3]; + r02[0] = x[i0][0] - x[i2][0]; + r02[1] = x[i0][1] - x[i2][1]; + r02[2] = x[i0][2] - 
x[i2][2]; + domain->minimum_image(r02); + + double r12[3]; + r12[0] = x[i1][0] - x[i2][0]; + r12[1] = x[i1][1] - x[i2][1]; + r12[2] = x[i1][2] - x[i2][2]; + domain->minimum_image(r12); + + // s01,s02,s12 = distance vec after unconstrained update, with PBC + + double s01[3]; + s01[0] = xshake[i0][0] - xshake[i1][0]; + s01[1] = xshake[i0][1] - xshake[i1][1]; + s01[2] = xshake[i0][2] - xshake[i1][2]; + domain->minimum_image(s01); + + double s02[3]; + s02[0] = xshake[i0][0] - xshake[i2][0]; + s02[1] = xshake[i0][1] - xshake[i2][1]; + s02[2] = xshake[i0][2] - xshake[i2][2]; + domain->minimum_image(s02); + + double s12[3]; + s12[0] = xshake[i1][0] - xshake[i2][0]; + s12[1] = xshake[i1][1] - xshake[i2][1]; + s12[2] = xshake[i1][2] - xshake[i2][2]; + domain->minimum_image(s12); + + // scalar distances between atoms + + double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2]; + double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2]; + double r12sq = r12[0]*r12[0] + r12[1]*r12[1] + r12[2]*r12[2]; + double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2]; + double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2]; + double s12sq = s12[0]*s12[0] + s12[1]*s12[1] + s12[2]*s12[2]; + + // matrix coeffs and rhs for lamda equations + + if (rmass) { + invmass0 = 1.0/rmass[i0]; + invmass1 = 1.0/rmass[i1]; + invmass2 = 1.0/rmass[i2]; + } else { + invmass0 = 1.0/mass[type[i0]]; + invmass1 = 1.0/mass[type[i1]]; + invmass2 = 1.0/mass[type[i2]]; + } + + double a11 = 2.0 * (invmass0+invmass1) * + (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]); + double a12 = 2.0 * invmass0 * + (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]); + double a13 = - 2.0 * invmass1 * + (s01[0]*r12[0] + s01[1]*r12[1] + s01[2]*r12[2]); + double a21 = 2.0 * invmass0 * + (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]); + double a22 = 2.0 * (invmass0+invmass2) * + (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]); + double a23 = 2.0 * invmass2 * + (s02[0]*r12[0] + s02[1]*r12[1] + s02[2]*r12[2]); + 
double a31 = - 2.0 * invmass1 * + (s12[0]*r01[0] + s12[1]*r01[1] + s12[2]*r01[2]); + double a32 = 2.0 * invmass2 * + (s12[0]*r02[0] + s12[1]*r02[1] + s12[2]*r02[2]); + double a33 = 2.0 * (invmass1+invmass2) * + (s12[0]*r12[0] + s12[1]*r12[1] + s12[2]*r12[2]); + + // inverse of matrix + + double determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - + a11*a23*a32 - a12*a21*a33 - a13*a22*a31; + if (determ == 0.0) error->one("Shake determinant = 0.0"); + double determinv = 1.0/determ; + + double a11inv = determinv * (a22*a33 - a23*a32); + double a12inv = -determinv * (a12*a33 - a13*a32); + double a13inv = determinv * (a12*a23 - a13*a22); + double a21inv = -determinv * (a21*a33 - a23*a31); + double a22inv = determinv * (a11*a33 - a13*a31); + double a23inv = -determinv * (a11*a23 - a13*a21); + double a31inv = determinv * (a21*a32 - a22*a31); + double a32inv = -determinv * (a11*a32 - a12*a31); + double a33inv = determinv * (a11*a22 - a12*a21); + + // quadratic correction coeffs + + double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]); + double r0112 = (r01[0]*r12[0] + r01[1]*r12[1] + r01[2]*r12[2]); + double r0212 = (r02[0]*r12[0] + r02[1]*r12[1] + r02[2]*r12[2]); + + double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + double quad1_0202 = invmass0*invmass0 * r02sq; + double quad1_1212 = invmass1*invmass1 * r12sq; + double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102; + double quad1_0112 = - 2.0 * (invmass0+invmass1)*invmass1 * r0112; + double quad1_0212 = - 2.0 * invmass0*invmass1 * r0212; + + double quad2_0101 = invmass0*invmass0 * r01sq; + double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + double quad2_1212 = invmass2*invmass2 * r12sq; + double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102; + double quad2_0112 = 2.0 * invmass0*invmass2 * r0112; + double quad2_0212 = 2.0 * (invmass0+invmass2)*invmass2 * r0212; + + double quad3_0101 = invmass1*invmass1 * r01sq; + double quad3_0202 = invmass2*invmass2 * 
r02sq; + double quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq; + double quad3_0102 = - 2.0 * invmass1*invmass2 * r0102; + double quad3_0112 = - 2.0 * (invmass1+invmass2)*invmass1 * r0112; + double quad3_0212 = 2.0 * (invmass1+invmass2)*invmass2 * r0212; + + // iterate until converged + + double lamda01 = 0.0; + double lamda02 = 0.0; + double lamda12 = 0.0; + int niter = 0; + int done = 0; + + double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new; + + while (!done && niter < max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + + quad1_0202 * lamda02*lamda02 + + quad1_1212 * lamda12*lamda12 + + quad1_0102 * lamda01*lamda02 + + quad1_0112 * lamda01*lamda12 + + quad1_0212 * lamda02*lamda12; + + quad2 = quad2_0101 * lamda01*lamda01 + + quad2_0202 * lamda02*lamda02 + + quad2_1212 * lamda12*lamda12 + + quad2_0102 * lamda01*lamda02 + + quad2_0112 * lamda01*lamda12 + + quad2_0212 * lamda02*lamda12; + + quad3 = quad3_0101 * lamda01*lamda01 + + quad3_0202 * lamda02*lamda02 + + quad3_1212 * lamda12*lamda12 + + quad3_0102 * lamda01*lamda02 + + quad3_0112 * lamda01*lamda12 + + quad3_0212 * lamda02*lamda12; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + b3 = bond12*bond12 - s12sq - quad3; + + lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; + lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; + lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3; + + done = 1; + if (fabs(lamda01_new-lamda01) > tolerance) done = 0; + if (fabs(lamda02_new-lamda02) > tolerance) done = 0; + if (fabs(lamda12_new-lamda12) > tolerance) done = 0; + + lamda01 = lamda01_new; + lamda02 = lamda02_new; + lamda12 = lamda12_new; + niter++; + } + + // update forces if atom is owned by this processor + + lamda01 = lamda01/dtfsq; + lamda02 = lamda02/dtfsq; + lamda12 = lamda12/dtfsq; + + if (i0 < nlocal) { + f[i0][0] += lamda01*r01[0] + lamda02*r02[0]; + f[i0][1] += lamda01*r01[1] + lamda02*r02[1]; + f[i0][2] += lamda01*r01[2] + lamda02*r02[2]; + } + + if (i1 < 
nlocal) { + f[i1][0] -= lamda01*r01[0] - lamda12*r12[0]; + f[i1][1] -= lamda01*r01[1] - lamda12*r12[1]; + f[i1][2] -= lamda01*r01[2] - lamda12*r12[2]; + } + + if (i2 < nlocal) { + f[i2][0] -= lamda02*r02[0] + lamda12*r12[0]; + f[i2][1] -= lamda02*r02[1] + lamda12*r12[1]; + f[i2][2] -= lamda02*r02[2] + lamda12*r12[2]; + } + + if (evflag) { + nlist = 0; + if (i0 < nlocal) list[nlist++] = i0; + if (i1 < nlocal) list[nlist++] = i1; + if (i2 < nlocal) list[nlist++] = i2; + + v[0] = lamda01*r01[0]*r01[0]+lamda02*r02[0]*r02[0]+lamda12*r12[0]*r12[0]; + v[1] = lamda01*r01[1]*r01[1]+lamda02*r02[1]*r02[1]+lamda12*r12[1]*r12[1]; + v[2] = lamda01*r01[2]*r01[2]+lamda02*r02[2]*r02[2]+lamda12*r12[2]*r12[2]; + v[3] = lamda01*r01[0]*r01[1]+lamda02*r02[0]*r02[1]+lamda12*r12[0]*r12[1]; + v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda12*r12[0]*r12[2]; + v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda12*r12[1]*r12[2]; + + v_tally(nlist,list,3.0,v); + } +} + +/* ---------------------------------------------------------------------- + print-out bond & angle statistics +------------------------------------------------------------------------- */ + +void FixShakeCuda::stats() +{ + int i,j,m,n,iatom,jatom,katom; + double delx,dely,delz; + double r,r1,r2,r3,angle; + + // zero out accumulators + + int nb = atom->nbondtypes + 1; + int na = atom->nangletypes + 1; + + for (i = 0; i < nb; i++) { + b_count[i] = 0; + b_ave[i] = b_max[i] = 0.0; + b_min[i] = BIG; + } + for (i = 0; i < na; i++) { + a_count[i] = 0; + a_ave[i] = a_max[i] = 0.0; + a_min[i] = BIG; + } + + // log stats for each bond & angle + // OK to double count since are just averaging + + double **x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + if (shake_flag[i] == 0) continue; + + // bond stats + + n = shake_flag[i]; + if (n == 1) n = 3; + iatom = atom->map(shake_atom[i][0]); + for (j = 1; j < n; j++) { + jatom = atom->map(shake_atom[i][j]); + delx = x[iatom][0] - x[jatom][0]; + dely = 
x[iatom][1] - x[jatom][1]; + delz = x[iatom][2] - x[jatom][2]; + domain->minimum_image(delx,dely,delz); + r = sqrt(delx*delx + dely*dely + delz*delz); + + m = shake_type[i][j-1]; + b_count[m]++; + b_ave[m] += r; + b_max[m] = MAX(b_max[m],r); + b_min[m] = MIN(b_min[m],r); + } + + // angle stats + + if (shake_flag[i] == 1) { + iatom = atom->map(shake_atom[i][0]); + jatom = atom->map(shake_atom[i][1]); + katom = atom->map(shake_atom[i][2]); + + delx = x[iatom][0] - x[jatom][0]; + dely = x[iatom][1] - x[jatom][1]; + delz = x[iatom][2] - x[jatom][2]; + domain->minimum_image(delx,dely,delz); + r1 = sqrt(delx*delx + dely*dely + delz*delz); + + delx = x[iatom][0] - x[katom][0]; + dely = x[iatom][1] - x[katom][1]; + delz = x[iatom][2] - x[katom][2]; + domain->minimum_image(delx,dely,delz); + r2 = sqrt(delx*delx + dely*dely + delz*delz); + + delx = x[jatom][0] - x[katom][0]; + dely = x[jatom][1] - x[katom][1]; + delz = x[jatom][2] - x[katom][2]; + domain->minimum_image(delx,dely,delz); + r3 = sqrt(delx*delx + dely*dely + delz*delz); + + angle = acos((r1*r1 + r2*r2 - r3*r3) / (2.0*r1*r2)); + angle *= 180.0/PI; + m = shake_type[i][2]; + a_count[m]++; + a_ave[m] += angle; + a_max[m] = MAX(a_max[m],angle); + a_min[m] = MIN(a_min[m],angle); + } + } + + // sum across all procs + + MPI_Allreduce(b_count,b_count_all,nb,MPI_INT,MPI_SUM,world); + MPI_Allreduce(b_ave,b_ave_all,nb,MPI_DOUBLE,MPI_SUM,world); + MPI_Allreduce(b_max,b_max_all,nb,MPI_DOUBLE,MPI_MAX,world); + MPI_Allreduce(b_min,b_min_all,nb,MPI_DOUBLE,MPI_MIN,world); + + MPI_Allreduce(a_count,a_count_all,na,MPI_INT,MPI_SUM,world); + MPI_Allreduce(a_ave,a_ave_all,na,MPI_DOUBLE,MPI_SUM,world); + MPI_Allreduce(a_max,a_max_all,na,MPI_DOUBLE,MPI_MAX,world); + MPI_Allreduce(a_min,a_min_all,na,MPI_DOUBLE,MPI_MIN,world); + + // print stats only for non-zero counts + + if (me == 0) { + if (screen) { + fprintf(screen, + "SHAKE stats (type/ave/delta) on step " BIGINT_FORMAT "\n", + update->ntimestep); + for (i = 1; i < nb; i++) + if 
(b_count_all[i]) + fprintf(screen," %d %g %g\n",i, + b_ave_all[i]/b_count_all[i],b_max_all[i]-b_min_all[i]); + for (i = 1; i < na; i++) + if (a_count_all[i]) + fprintf(screen," %d %g %g\n",i, + a_ave_all[i]/a_count_all[i],a_max_all[i]-a_min_all[i]); + } + if (logfile) { + fprintf(logfile, + "SHAKE stats (type/ave/delta) on step " BIGINT_FORMAT "\n", + update->ntimestep); + for (i = 0; i < nb; i++) + if (b_count_all[i]) + fprintf(logfile," %d %g %g\n",i, + b_ave_all[i]/b_count_all[i],b_max_all[i]-b_min_all[i]); + for (i = 0; i < na; i++) + if (a_count_all[i]) + fprintf(logfile," %d %g %g\n",i, + a_ave_all[i]/a_count_all[i],a_max_all[i]-a_min_all[i]); + } + } + + // next timestep for stats + + next_output += output_every; +} + +/* ---------------------------------------------------------------------- + find a bond between global tags n1 and n2 stored with local atom i + return -1 if don't find it + return bond index if do find it +------------------------------------------------------------------------- */ + +int FixShakeCuda::bondfind(int i, int n1, int n2) +{ + int *tag = atom->tag; + int **bond_atom = atom->bond_atom; + int nbonds = atom->num_bond[i]; + + int m; + for (m = 0; m < nbonds; m++) { + if (n1 == tag[i] && n2 == bond_atom[i][m]) break; + if (n1 == bond_atom[i][m] && n2 == tag[i]) break; + } + if (m < nbonds) return m; + return -1; +} + +/* ---------------------------------------------------------------------- + find an angle with global end atoms n1 and n2 stored with local atom i + return -1 if don't find it + return angle index if do find it +------------------------------------------------------------------------- */ + +int FixShakeCuda::anglefind(int i, int n1, int n2) +{ + int **angle_atom1 = atom->angle_atom1; + int **angle_atom3 = atom->angle_atom3; + int nangles = atom->num_angle[i]; + + int m; + for (m = 0; m < nangles; m++) { + if (n1 == angle_atom1[i][m] && n2 == angle_atom3[i][m]) break; + if (n1 == angle_atom3[i][m] && n2 == 
angle_atom1[i][m]) break; + } + if (m < nangles) return m; + return -1; +} + +/* ---------------------------------------------------------------------- + memory usage of local atom-based arrays +------------------------------------------------------------------------- */ + +double FixShakeCuda::memory_usage() +{ + int nmax = atom->nmax; + double bytes = nmax * sizeof(int); + bytes += nmax*4 * sizeof(int); + bytes += nmax*3 * sizeof(int); + bytes += nmax*3 * sizeof(double); + bytes += maxvatom*6 * sizeof(double); + return bytes; +} + +/* ---------------------------------------------------------------------- + allocate local atom-based arrays +------------------------------------------------------------------------- */ + +void FixShakeCuda::grow_arrays(int nmax) +{ + memory->grow(shake_flag,nmax,"shake:shake_flag"); + memory->grow(shake_atom,nmax,4,"shake:shake_atom"); + memory->grow(shake_type,nmax,3,"shake:shake_type"); + memory->destroy(xshake); + memory->create(xshake,nmax,3,"shake:xshake"); + + delete cu_shake_flag; cu_shake_flag = new cCudaData (shake_flag, nmax ); + delete cu_shake_atom; cu_shake_atom = new cCudaData ((int*)shake_atom, nmax, 4); + delete cu_shake_type; cu_shake_type = new cCudaData ((int*)shake_type, nmax, 3); + delete cu_xshake; cu_xshake = new cCudaData ((double*)xshake, nmax, 3); + cu_shake_flag->upload(); + cu_shake_atom->upload(); + cu_shake_type->upload(); + if(cu_bond_distance) + Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq, + cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(), + cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(), + max_iter,tolerance); +} + +/* ---------------------------------------------------------------------- + copy values within local atom-based arrays +------------------------------------------------------------------------- */ + +void FixShakeCuda::copy_arrays(int i, int j) +{ + int flag = shake_flag[j] = shake_flag[i]; + if 
(flag == 1) { + shake_atom[j][0] = shake_atom[i][0]; + shake_atom[j][1] = shake_atom[i][1]; + shake_atom[j][2] = shake_atom[i][2]; + shake_type[j][0] = shake_type[i][0]; + shake_type[j][1] = shake_type[i][1]; + shake_type[j][2] = shake_type[i][2]; + } else if (flag == 2) { + shake_atom[j][0] = shake_atom[i][0]; + shake_atom[j][1] = shake_atom[i][1]; + shake_type[j][0] = shake_type[i][0]; + } else if (flag == 3) { + shake_atom[j][0] = shake_atom[i][0]; + shake_atom[j][1] = shake_atom[i][1]; + shake_atom[j][2] = shake_atom[i][2]; + shake_type[j][0] = shake_type[i][0]; + shake_type[j][1] = shake_type[i][1]; + } else if (flag == 4) { + shake_atom[j][0] = shake_atom[i][0]; + shake_atom[j][1] = shake_atom[i][1]; + shake_atom[j][2] = shake_atom[i][2]; + shake_atom[j][3] = shake_atom[i][3]; + shake_type[j][0] = shake_type[i][0]; + shake_type[j][1] = shake_type[i][1]; + shake_type[j][2] = shake_type[i][2]; + } +} + +/* ---------------------------------------------------------------------- + initialize one atom's array values, called when atom is created +------------------------------------------------------------------------- */ + +void FixShakeCuda::set_arrays(int i) +{ + shake_flag[i] = 0; +} + +/* ---------------------------------------------------------------------- + pack values in local atom-based arrays for exchange with another proc +------------------------------------------------------------------------- */ + +int FixShakeCuda::pack_exchange(int i, double *buf) +{ + int m = 0; + buf[m++] = shake_flag[i]; + int flag = shake_flag[i]; + if (flag == 1) { + buf[m++] = shake_atom[i][0]; + buf[m++] = shake_atom[i][1]; + buf[m++] = shake_atom[i][2]; + buf[m++] = shake_type[i][0]; + buf[m++] = shake_type[i][1]; + buf[m++] = shake_type[i][2]; + } else if (flag == 2) { + buf[m++] = shake_atom[i][0]; + buf[m++] = shake_atom[i][1]; + buf[m++] = shake_type[i][0]; + } else if (flag == 3) { + buf[m++] = shake_atom[i][0]; + buf[m++] = shake_atom[i][1]; + buf[m++] = 
shake_atom[i][2]; + buf[m++] = shake_type[i][0]; + buf[m++] = shake_type[i][1]; + } else if (flag == 4) { + buf[m++] = shake_atom[i][0]; + buf[m++] = shake_atom[i][1]; + buf[m++] = shake_atom[i][2]; + buf[m++] = shake_atom[i][3]; + buf[m++] = shake_type[i][0]; + buf[m++] = shake_type[i][1]; + buf[m++] = shake_type[i][2]; + } + return m; +} + +/* ---------------------------------------------------------------------- + unpack values in local atom-based arrays from exchange with another proc +------------------------------------------------------------------------- */ + +int FixShakeCuda::unpack_exchange(int nlocal, double *buf) +{ + int m = 0; + int flag = shake_flag[nlocal] = static_cast (buf[m++]); + if (flag == 1) { + shake_atom[nlocal][0] = static_cast (buf[m++]); + shake_atom[nlocal][1] = static_cast (buf[m++]); + shake_atom[nlocal][2] = static_cast (buf[m++]); + shake_type[nlocal][0] = static_cast (buf[m++]); + shake_type[nlocal][1] = static_cast (buf[m++]); + shake_type[nlocal][2] = static_cast (buf[m++]); + } else if (flag == 2) { + shake_atom[nlocal][0] = static_cast (buf[m++]); + shake_atom[nlocal][1] = static_cast (buf[m++]); + shake_type[nlocal][0] = static_cast (buf[m++]); + } else if (flag == 3) { + shake_atom[nlocal][0] = static_cast (buf[m++]); + shake_atom[nlocal][1] = static_cast (buf[m++]); + shake_atom[nlocal][2] = static_cast (buf[m++]); + shake_type[nlocal][0] = static_cast (buf[m++]); + shake_type[nlocal][1] = static_cast (buf[m++]); + } else if (flag == 4) { + shake_atom[nlocal][0] = static_cast (buf[m++]); + shake_atom[nlocal][1] = static_cast (buf[m++]); + shake_atom[nlocal][2] = static_cast (buf[m++]); + shake_atom[nlocal][3] = static_cast (buf[m++]); + shake_type[nlocal][0] = static_cast (buf[m++]); + shake_type[nlocal][1] = static_cast (buf[m++]); + shake_type[nlocal][2] = static_cast (buf[m++]); + } + return m; +} + +/* ---------------------------------------------------------------------- + enforce SHAKE constraints from rRESPA + 
prediction portion is different than Verlet + rRESPA updating of atom coords is done with full v, but only portions of f +------------------------------------------------------------------------- */ +/* +void FixShakeCuda::post_force_respa(int vflag, int ilevel, int iloop) +{ + // call stats only on outermost level + + if (ilevel == nlevels_respa-1 && update->ntimestep == next_output) stats(); + + // perform SHAKE on every loop iteration of every rRESPA level + // except last loop iteration of inner levels + + if (ilevel < nlevels_respa-1 && iloop == loop_respa[ilevel]-1) return; + + // xshake = atom coords after next x update in innermost loop + // depends on rRESPA level + // for levels > 0 this includes more than one velocity update + // xshake = predicted position from call to this routine at level N = + // x + dt0 (v + dtN/m fN + 1/2 dt(N-1)/m f(N-1) + ... + 1/2 dt0/m f0) + + double ***f_level = ((FixRespa *) modify->fix[ifix_respa])->f_level; + dtfsq = dtf_inner * step_respa[ilevel]; + + double invmass,dtfmsq; + int jlevel; + + if (rmass) { + for (int i = 0; i < nlocal; i++) { + if (shake_flag[i]) { + invmass = 1.0 / rmass[i]; + dtfmsq = dtfsq * invmass; + xshake[i][0] = x[i][0] + dtv*v[i][0] + dtfmsq*f[i][0]; + xshake[i][1] = x[i][1] + dtv*v[i][1] + dtfmsq*f[i][1]; + xshake[i][2] = x[i][2] + dtv*v[i][2] + dtfmsq*f[i][2]; + for (jlevel = 0; jlevel < ilevel; jlevel++) { + dtfmsq = dtf_innerhalf * step_respa[jlevel] * invmass; + xshake[i][0] += dtfmsq*f_level[i][jlevel][0]; + xshake[i][1] += dtfmsq*f_level[i][jlevel][1]; + xshake[i][2] += dtfmsq*f_level[i][jlevel][2]; + } + } else xshake[i][2] = xshake[i][1] = xshake[i][0] = 0.0; + } + + } else { + for (int i = 0; i < nlocal; i++) { + if (shake_flag[i]) { + invmass = 1.0 / mass[type[i]]; + dtfmsq = dtfsq * invmass; + xshake[i][0] = x[i][0] + dtv*v[i][0] + dtfmsq*f[i][0]; + xshake[i][1] = x[i][1] + dtv*v[i][1] + dtfmsq*f[i][1]; + xshake[i][2] = x[i][2] + dtv*v[i][2] + dtfmsq*f[i][2]; + for (jlevel = 0; jlevel < 
ilevel; jlevel++) { + dtfmsq = dtf_innerhalf * step_respa[jlevel] * invmass; + xshake[i][0] += dtfmsq*f_level[i][jlevel][0]; + xshake[i][1] += dtfmsq*f_level[i][jlevel][1]; + xshake[i][2] += dtfmsq*f_level[i][jlevel][2]; + } + } else xshake[i][2] = xshake[i][1] = xshake[i][0] = 0.0; + } + } + + // communicate results if necessary + + if (nprocs > 1) comm->forward_comm_fix(this); + + // virial setup + + if (vflag) v_setup(vflag); + else evflag = 0; + + // loop over clusters + + int m; + for (int i = 0; i < nlist; i++) { + m = list[i]; + if (shake_flag[m] == 2) shake2(m); + else if (shake_flag[m] == 3) shake3(m); + else if (shake_flag[m] == 4) shake4(m); + else shake3angle(m); + } +} + +/* ---------------------------------------------------------------------- */ + +int FixShakeCuda::pack_comm(int n, int *list, double *buf, int pbc_flag, int *pbc) +{ + if(cuda->finished_setup) + { + int iswap=*list; + if(iswap<0) + { + iswap=-iswap-1; + int first= ((int*) buf)[0]; + Cuda_FixShakeCuda_PackComm_Self(&cuda->shared_data,n,iswap,first,pbc,pbc_flag); + } + else + Cuda_FixShakeCuda_PackComm(&cuda->shared_data,n,iswap,(void*) buf,pbc,pbc_flag); + return 3; + } + + int i,j,m; + double dx,dy,dz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = xshake[j][0]; + buf[m++] = xshake[j][1]; + buf[m++] = xshake[j][2]; + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = xshake[j][0] + dx; + buf[m++] = xshake[j][1] + dy; + buf[m++] = xshake[j][2] + dz; + } + } + return 3; +} + +/* ---------------------------------------------------------------------- */ + +void FixShakeCuda::unpack_comm(int n, int first, double *buf) +{ + if(cuda->finished_setup) + { + 
Cuda_FixShakeCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf); + return; + } + + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + xshake[i][0] = buf[m++]; + xshake[i][1] = buf[m++]; + xshake[i][2] = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +void FixShakeCuda::reset_dt() +{ + if (strcmp(update->integrate_style,"verlet") == 0) { + dtv = update->dt; + dtfsq = update->dt * update->dt * force->ftm2v; + } else { + dtv = step_respa[0]; + dtf_innerhalf = 0.5 * step_respa[0] * force->ftm2v; + dtf_inner = step_respa[0] * force->ftm2v; + } + if(cu_shake_atom) + Cuda_FixShakeCuda_Init(&cuda->shared_data,dtv, dtfsq, + cu_shake_flag->dev_data(),cu_shake_atom->dev_data(),cu_shake_type->dev_data(), cu_xshake->dev_data(), + cu_bond_distance->dev_data(),cu_angle_distance->dev_data(),cu_virial->dev_data(), + max_iter,tolerance); +} diff --git a/src/USER-CUDA/fix_shake_cuda.h b/src/USER-CUDA/fix_shake_cuda.h new file mode 100644 index 0000000000..18ea64f983 --- /dev/null +++ b/src/USER-CUDA/fix_shake_cuda.h @@ -0,0 +1,133 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(shake/cuda,FixShakeCuda) + +#else + +#ifndef LMP_FIX_SHAKE_CUDA_H +#define LMP_FIX_SHAKE_CUDA_H + +#include "fix.h" +#include "cuda_data.h" +#include "cuda_precision.h" + +namespace LAMMPS_NS { + +class FixShakeCuda : public Fix { + public: + FixShakeCuda(class LAMMPS *, int, char **); + ~FixShakeCuda(); + int setmask(); + void init(); + void setup(int); + void pre_neighbor(); + void post_force(int); + //void post_force_respa(int, int, int); + + double memory_usage(); + void grow_arrays(int); + void copy_arrays(int, int); + void set_arrays(int); + int pack_exchange(int, double *); + int unpack_exchange(int, double *); + int pack_comm(int, int *, double *, int, int *); + void unpack_comm(int, int, double *); + + int dof(int); + void reset_dt(); + + double time_postforce; + private: + class Cuda *cuda; + int me,nprocs; + double PI; + double tolerance; // SHAKE tolerance + int max_iter; // max # of SHAKE iterations + int output_every; // SHAKE stat output every so often + int next_output; // timestep for next output + + // settings from input command + int *bond_flag,*angle_flag; // bond/angle types to constrain + int *type_flag; // constrain bonds to these types + double *mass_list; // constrain bonds to these masses + int nmass; // # of masses in mass_list + bool neighbor_step; // was neighboring done in this step -> need to run the Cuda_FixShake_Init + + double *bond_distance,*angle_distance; // constraint distances + cCudaData* cu_bond_distance; + cCudaData* cu_angle_distance; + + int ifix_respa; // rRESPA fix needed by SHAKE + int nlevels_respa; // copies of needed rRESPA variables + int *loop_respa; + double *step_respa; + + double **x,**v,**f; // local ptrs to atom class quantities + double *mass,*rmass; + int *type; + int nlocal; + // atom-based arrays + int *shake_flag; // 0 if atom not in SHAKE cluster + // 1 = size 3 angle cluster + // 2,3,4 = size 
of bond-only cluster + int **shake_atom; // global IDs of atoms in cluster + // central atom is 1st + // lowest global ID is 1st for size 2 + + int **shake_type; // bondtype of each bond in cluster + // for angle cluster, 3rd value + // is angletype + double **xshake; // unconstrained atom coords + cCudaData* cu_shake_flag; + cCudaData* cu_shake_atom; + cCudaData* cu_shake_type; + cCudaData* cu_xshake; + cCudaData* cu_list; + cCudaData* cu_virial; + int* countoccur; + + int vflag; // virial flag + double dtv,dtfsq; // timesteps for trial move + double dtf_inner,dtf_innerhalf; // timesteps for rRESPA trial move + + int *list; // list of clusters to SHAKE + int nlist,maxlist; // size and max-size of list + + // stat quantities + int *b_count,*b_count_all; // counts for each bond type + double *b_ave,*b_max,*b_min; // ave/max/min dist for each bond type + double *b_ave_all,*b_max_all,*b_min_all; // MPI summing arrays + int *a_count,*a_count_all; // ditto for angle types + double *a_ave,*a_max,*a_min; + double *a_ave_all,*a_max_all,*a_min_all; + + void find_clusters(); + void swap_clusters(int i,int j); + int masscheck(double); + void unconstrained_update(); + void shake2(int); + void shake3(int); + void shake4(int); + void shake3angle(int); + void stats(); + int bondfind(int, int, int); + int anglefind(int, int, int); +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/fix_temp_berendsen_cuda.cpp b/src/USER-CUDA/fix_temp_berendsen_cuda.cpp new file mode 100644 index 0000000000..2c9853c7c2 --- /dev/null +++ b/src/USER-CUDA/fix_temp_berendsen_cuda.cpp @@ -0,0 +1,220 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "fix_temp_berendsen_cuda.h" +#include "fix_temp_berendsen_cuda_cu.h" +#include "atom.h" +#include "force.h" +#include "group.h" +#include "update.h" +#include "comm.h" +#include "modify.h" +#include "compute.h" +#include "error.h" +#include "cuda.h" +#include "cuda_modify_flags.h" + +using namespace LAMMPS_NS; + +enum{NOBIAS,BIAS}; + +/* ---------------------------------------------------------------------- */ + +FixTempBerendsenCuda::FixTempBerendsenCuda(LAMMPS *lmp, int narg, char **arg) : + Fix(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + if (narg != 6) error->all("Illegal fix temp/berendsen/cuda command"); + + // Berendsen thermostat should be applied every step + + nevery = 1; + + t_start = atof(arg[3]); + t_stop = atof(arg[4]); + t_period = atof(arg[5]); + + // error checks + + if (t_period <= 0.0) error->all("Fix temp/berendsen/cuda period must be > 0.0"); + + // create a new compute temp style + // id = fix-ID + temp, compute group = fix group + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp/cuda"; + modify->add_compute(3,newarg); + delete [] newarg; + tflag = 1; +} + +/* ---------------------------------------------------------------------- */ + +FixTempBerendsenCuda::~FixTempBerendsenCuda() +{ + // delete temperature if fix created it + + if (tflag) modify->delete_compute(id_temp); + delete [] id_temp; +} + +/* ---------------------------------------------------------------------- */ + +int FixTempBerendsenCuda::setmask() +{ + int mask = 0; + mask |= END_OF_STEP_CUDA; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixTempBerendsenCuda::init() +{ + int icompute = modify->find_compute(id_temp); + if (icompute < 0) + error->all("Temperature ID for fix temp/berendsen/cuda does not exist"); + temperature = modify->compute[icompute]; + if(not temperature->cudable) + error->warning("Fix temp/berendsen/cuda uses non cudable temperature compute"); + if (temperature->tempbias) which = BIAS; + else which = NOBIAS; + + //temperature->init(); //not in original berendsen possible error? 
+} + +/* ---------------------------------------------------------------------- */ + +void FixTempBerendsenCuda::end_of_step() +{ + double t_current; + if(not temperature->cudable) {cuda->cu_x->download();cuda->cu_v->download();} + t_current = temperature->compute_scalar(); + if (t_current == 0.0) + error->all("Computed temperature for fix temp/berendsen/cuda cannot be 0.0"); + + double delta = update->ntimestep - update->beginstep; + delta /= update->endstep - update->beginstep; + t_target = t_start + delta * (t_stop-t_start); + + // rescale velocities by lamda + + double lamda = sqrt(1.0 + update->dt/t_period*(t_target/t_current - 1.0)); + + double **v = atom->v; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + if (which == NOBIAS) { + Cuda_FixTempBerendsenCuda_EndOfStep(&cuda->shared_data, groupbit,lamda); + + } else { + if(not temperature->cudable) + { + cuda->cu_x->download();cuda->cu_v->download(); + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + temperature->remove_bias(i,v[i]); + v[i][0] *= lamda; + v[i][1] *= lamda; + v[i][2] *= lamda; + temperature->restore_bias(i,v[i]); + } + } + cuda->cu_v->upload(); + } + else + { + temperature->remove_bias_all(); + Cuda_FixTempBerendsenCuda_EndOfStep(&cuda->shared_data, groupbit,lamda); + temperature->restore_bias_all(); + } + } + + +} + +/* ---------------------------------------------------------------------- */ + +int FixTempBerendsenCuda::modify_param(int narg, char **arg) +{ + if (strcmp(arg[0],"temp") == 0) { + if (narg < 2) error->all("Illegal fix_modify command"); + if (tflag) { + modify->delete_compute(id_temp); + tflag = 0; + } + delete [] id_temp; + int n = strlen(arg[1]) + 1; + id_temp = new char[n]; + strcpy(id_temp,arg[1]); + + int icompute = modify->find_compute(id_temp); + if (icompute < 0) error->all("Could not find fix_modify temperature ID"); + temperature = modify->compute[icompute]; + + if (temperature->tempflag == 0) + error->all("Fix_modify temperature ID does not 
compute temperature"); + if (temperature->igroup != igroup && comm->me == 0) + error->warning("Group for fix_modify temp != fix group"); + return 2; + } + return 0; +} + + +/* ---------------------------------------------------------------------- */ + +void FixTempBerendsenCuda::reset_target(double t_new) +{ + t_start = t_stop = t_new; +} + + + diff --git a/src/USER-CUDA/fix_temp_berendsen_cuda.h b/src/USER-CUDA/fix_temp_berendsen_cuda.h new file mode 100644 index 0000000000..391cd07e72 --- /dev/null +++ b/src/USER-CUDA/fix_temp_berendsen_cuda.h @@ -0,0 +1,58 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+#ifdef FIX_CLASS
+
+FixStyle(temp/berendsen/cuda,FixTempBerendsenCuda)
+
+#else
+
+#ifndef LMP_FIX_TEMP_BERENDSEN_CUDA_H
+#define LMP_FIX_TEMP_BERENDSEN_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+class FixTempBerendsenCuda : public Fix {  // fix style "temp/berendsen/cuda" (see FixStyle above)
+ public:
+  FixTempBerendsenCuda(class LAMMPS *, int, char **);
+  ~FixTempBerendsenCuda();
+  int setmask();
+  void init();
+  void end_of_step();              // scales group velocities toward the ramped target temperature
+  int modify_param(int, char **);  // handles the fix_modify "temp" keyword
+  void reset_target(double);       // sets t_start and t_stop to the given value
+
+ private:
+  class Cuda *cuda;
+  int which;                                // compared against NOBIAS in end_of_step()
+  double t_start,t_stop,t_target,t_period;  // ramp end points, current target, and the period used in the scaling factor
+
+  char *id_temp;               // ID of the temperature compute (heap string, replaced by modify_param)
+  class Compute *temperature;  // compute that supplies the current temperature
+  int tflag;                   // nonzero: modify_param() deletes compute id_temp before switching (presumably set when this fix created it)
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_temp_rescale_cuda.cpp b/src/USER-CUDA/fix_temp_rescale_cuda.cpp
new file mode 100644
index 0000000000..42f038c6b0
--- /dev/null
+++ b/src/USER-CUDA/fix_temp_rescale_cuda.cpp
@@ -0,0 +1,222 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include +#include +#include +#include "fix_temp_rescale_cuda.h" +#include "fix_temp_rescale_cuda_cu.h" +#include "atom.h" +#include "force.h" +#include "group.h" +#include "update.h" +#include "domain.h" +#include "region.h" +#include "comm.h" +#include "modify.h" +#include "compute.h" +#include "error.h" +#include "cuda.h" +#include "cuda_modify_flags.h" + +using namespace LAMMPS_NS; + +enum{NOBIAS,BIAS}; + +/* ---------------------------------------------------------------------- */ + +FixTempRescaleCuda::FixTempRescaleCuda(LAMMPS *lmp, int narg, char **arg) : + Fix(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + if (narg < 8) error->all("Illegal fix temp/rescale/cuda command"); + + nevery = atoi(arg[3]); + if (nevery <= 0) error->all("Illegal fix temp/rescale/cuda command"); + + scalar_flag = 1; + global_freq = nevery; + extscalar = 1; + + t_start = atof(arg[4]); + t_stop = atof(arg[5]); + t_window = atof(arg[6]); + fraction = atof(arg[7]); + + // create a new compute temp + // id = fix-ID + temp, compute group = fix group + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[6]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp/cuda"; + modify->add_compute(3,newarg); + delete [] newarg; + tflag = 1; + + energy = 0.0; +} + +/* ---------------------------------------------------------------------- */ + +FixTempRescaleCuda::~FixTempRescaleCuda() +{ + // delete temperature if fix created it + + if (tflag) modify->delete_compute(id_temp); + delete [] id_temp; +} + +/* ---------------------------------------------------------------------- */ + +int FixTempRescaleCuda::setmask() +{ + int mask = 0; + mask |= END_OF_STEP_CUDA; + 
mask |= THERMO_ENERGY_CUDA; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixTempRescaleCuda::init() +{ + int icompute = modify->find_compute(id_temp); + if (icompute < 0) + error->all("Temperature ID for fix temp/rescale/cuda does not exist"); + temperature = modify->compute[icompute]; + if(not temperature->cudable) + error->warning("Fix temp/rescale/cuda uses non cudable temperature compute"); + if (temperature->tempbias) which = BIAS; + else which = NOBIAS; +} + +/* ---------------------------------------------------------------------- */ + +void FixTempRescaleCuda::end_of_step() +{ + double t_current; + if(not temperature->cudable) {cuda->cu_x->download();cuda->cu_v->download();} + t_current = temperature->compute_scalar(); + if (t_current == 0.0) + error->all("Computed temperature for fix temp/rescale/cuda cannot be 0.0"); + + double delta = update->ntimestep - update->beginstep; + delta /= update->endstep - update->beginstep; + double t_target = t_start + delta * (t_stop-t_start); + + // rescale velocity of appropriate atoms if outside window + + if (fabs(t_current-t_target) > t_window) { + t_target = t_current - fraction*(t_current-t_target); + double factor = sqrt(t_target/t_current); + double efactor = 0.5 * force->boltz * temperature->dof; + + double **v = atom->v; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + if (which == NOBIAS) { + energy += (t_current-t_target) * efactor; + + Cuda_FixTempRescaleCuda_EndOfStep(&cuda->shared_data, groupbit,factor); + + } else if (which == BIAS) { + energy += (t_current-t_target) * efactor; + if(not temperature->cudable) + { + cuda->cu_x->download();cuda->cu_v->download(); + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + temperature->remove_bias(i,v[i]); + v[i][0] *= factor; + v[i][1] *= factor; + v[i][2] *= factor; + temperature->restore_bias(i,v[i]); + } + } + cuda->cu_v->upload(); + } + else + { + temperature->remove_bias_all(); 
+ Cuda_FixTempRescaleCuda_EndOfStep(&cuda->shared_data, groupbit,factor); + temperature->restore_bias_all(); + } + } + + } +} + +/* ---------------------------------------------------------------------- */ + +int FixTempRescaleCuda::modify_param(int narg, char **arg) +{ + if (strcmp(arg[0],"temp") == 0) { + if (narg < 2) error->all("Illegal fix_modify command"); + if (tflag) { + modify->delete_compute(id_temp); + tflag = 0; + } + delete [] id_temp; + int n = strlen(arg[1]) + 1; + id_temp = new char[n]; + strcpy(id_temp,arg[1]); + + int icompute = modify->find_compute(id_temp); + if (icompute < 0) error->all("Could not find fix_modify temperature ID"); + temperature = modify->compute[icompute]; + + if (temperature->tempflag == 0) + error->all("Fix_modify temperature ID does not compute temperature"); + if (temperature->igroup != igroup && comm->me == 0) + error->warning("Group for fix_modify temp != fix group"); + if(not temperature->cudable) + error->warning("Fix temp/rescale/cuda uses non cudable temperature compute"); + return 2; + } + return 0; +} + + +/* ---------------------------------------------------------------------- */ + +void FixTempRescaleCuda::reset_target(double t_new) +{ + t_start = t_stop = t_new; +} + +/* ---------------------------------------------------------------------- */ + +double FixTempRescaleCuda::compute_scalar() +{ + return energy; +} diff --git a/src/USER-CUDA/fix_temp_rescale_cuda.h b/src/USER-CUDA/fix_temp_rescale_cuda.h new file mode 100644 index 0000000000..75876e60d3 --- /dev/null +++ b/src/USER-CUDA/fix_temp_rescale_cuda.h @@ -0,0 +1,61 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(temp/rescale/cuda,FixTempRescaleCuda)
+
+#else
+
+#ifndef FIX_TEMP_RESCALE_CUDA_H
+#define FIX_TEMP_RESCALE_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+class FixTempRescaleCuda : public Fix {  // fix style "temp/rescale/cuda" (see FixStyle above)
+ public:
+  FixTempRescaleCuda(class LAMMPS *, int, char **);
+  ~FixTempRescaleCuda();
+  int setmask();
+  void init();
+  void end_of_step();              // rescales group velocities when outside the temperature window
+  int modify_param(int, char **);  // handles the fix_modify "temp" keyword
+  void reset_target(double);       // sets t_start and t_stop to the given value
+  double compute_scalar();         // returns the accumulated energy tally
+
+ private:
+  class Cuda *cuda;
+  int which;                       // BIAS or NOBIAS, chosen in init()
+  double t_start,t_stop,t_window;  // ramp end points and tolerated deviation from the target
+  double fraction,energy,efactor;  // NOTE(review): end_of_step() declares its own local efactor, so this member looks unused - confirm
+
+  char *id_temp;               // ID of the temperature compute (heap string)
+  class Compute *temperature;  // compute that supplies the current temperature
+  int tflag;                   // 1 while the compute named by id_temp was created by this fix
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp b/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
new file mode 100644
index 0000000000..c8730a1728
--- /dev/null
+++ b/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
@@ -0,0 +1,237 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "fix_temp_rescale_limit_cuda.h" +#include "fix_temp_rescale_limit_cuda_cu.h" +#include "atom.h" +#include "force.h" +#include "group.h" +#include "update.h" +#include "domain.h" +#include "region.h" +#include "comm.h" +#include "modify.h" +#include "compute.h" +#include "error.h" +#include "cuda.h" +#include "cuda_modify_flags.h" + +using namespace LAMMPS_NS; +#define MIN(A,B) ((A) < (B)) ? (A) : (B) +#define MAX(A,B) ((A) > (B)) ? (A) : (B) + +enum{NOBIAS,BIAS}; + +/* ---------------------------------------------------------------------- */ + +FixTempRescaleLimitCuda::FixTempRescaleLimitCuda(LAMMPS *lmp, int narg, char **arg) : + Fix(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + if (narg < 9) error->all("Illegal fix temp/rescale/limit/cuda command"); + + nevery = atoi(arg[3]); + if (nevery <= 0) error->all("Illegal fix temp/rescale/limit/cuda command"); + + scalar_flag = 1; + global_freq = nevery; + extscalar = 1; + + t_start = atof(arg[4]); + t_stop = atof(arg[5]); + t_window = atof(arg[6]); + fraction = atof(arg[7]); + limit = atof(arg[8]); + if (limit <= 1.0) error->all("Illegal fix temp/rescale/limit/cuda command (limit must be > 1.0)"); + + + // create a new compute temp + // id = fix-ID + temp, compute group = fix group + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[6]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp/cuda"; + modify->add_compute(3,newarg); + delete [] newarg; + tflag = 1; + + energy = 0.0; +} + +/* ---------------------------------------------------------------------- */ + +FixTempRescaleLimitCuda::~FixTempRescaleLimitCuda() +{ + // delete temperature if fix created it + + if (tflag) modify->delete_compute(id_temp); + delete [] id_temp; +} + +/* ---------------------------------------------------------------------- */ + +int FixTempRescaleLimitCuda::setmask() +{ + int mask = 0; + mask |= END_OF_STEP_CUDA; + mask |= THERMO_ENERGY_CUDA; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixTempRescaleLimitCuda::init() +{ + int icompute = modify->find_compute(id_temp); + if (icompute < 0) + error->all("Temperature ID for fix temp/rescale/limit/cuda does not exist"); + temperature = modify->compute[icompute]; + if(not temperature->cudable) + error->warning("Fix temp/rescale/limit/cuda uses non cudable temperature compute"); + if (temperature->tempbias) which = BIAS; + else which = NOBIAS; +} + +/* ---------------------------------------------------------------------- */ + +void 
FixTempRescaleLimitCuda::end_of_step() +{ + double t_current; + if(not temperature->cudable) {cuda->cu_x->download();cuda->cu_v->download();} + t_current = temperature->compute_scalar(); + if (t_current == 0.0) + error->all("Computed temperature for fix temp/rescale/limit/cuda cannot be 0.0"); + + double delta = update->ntimestep - update->beginstep; + delta /= update->endstep - update->beginstep; + double t_target = t_start + delta * (t_stop-t_start); + + // rescale velocity of appropriate atoms if outside window + + if (fabs(t_current-t_target) > t_window) { + t_target = t_current - fraction*(t_current-t_target); + double factor = sqrt(t_target/t_current); + double efactor = 0.5 * force->boltz * temperature->dof; + + double **v = atom->v; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + double massone; + if(atom->rmass) massone = atom->rmass[0]; + else massone = atom->mass[0]; + + double current_limit=sqrt(limit*force->boltz*t_target*temperature->dof/massone/force->mvv2e); + if (which == NOBIAS) { + energy += (t_current-t_target) * efactor; + + + Cuda_FixTempRescaleLimitCuda_EndOfStep(&cuda->shared_data, groupbit,factor,current_limit); + + } else if (which == BIAS) { + energy += (t_current-t_target) * efactor; + if(not temperature->cudable) + { + cuda->cu_x->download();cuda->cu_v->download(); + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + temperature->remove_bias(i,v[i]); + double vx = v[i][0] * factor; + double vy = v[i][1] * factor; + double vz = v[i][2] * factor; + v[i][0]=vx>0?MIN(vx,current_limit):MAX(vx,-current_limit); + v[i][1]=vy>0?MIN(vy,current_limit):MAX(vy,-current_limit); + v[i][2]=vz>0?MIN(vz,current_limit):MAX(vz,-current_limit); + + temperature->restore_bias(i,v[i]); + } + } + cuda->cu_v->upload(); + } + else + { + temperature->remove_bias_all(); + Cuda_FixTempRescaleLimitCuda_EndOfStep(&cuda->shared_data, groupbit,factor,current_limit); + temperature->restore_bias_all(); + } + } + + } +} + +/* 
---------------------------------------------------------------------- */ + +int FixTempRescaleLimitCuda::modify_param(int narg, char **arg) +{ + if (strcmp(arg[0],"temp") == 0) { + if (narg < 2) error->all("Illegal fix_modify command"); + if (tflag) { + modify->delete_compute(id_temp); + tflag = 0; + } + delete [] id_temp; + int n = strlen(arg[1]) + 1; + id_temp = new char[n]; + strcpy(id_temp,arg[1]); + + int icompute = modify->find_compute(id_temp); + if (icompute < 0) error->all("Could not find fix_modify temperature ID"); + temperature = modify->compute[icompute]; + + if (temperature->tempflag == 0) + error->all("Fix_modify temperature ID does not compute temperature"); + if (temperature->igroup != igroup && comm->me == 0) + error->warning("Group for fix_modify temp != fix group"); + if(not temperature->cudable) + error->warning("Fix temp/rescale/limit/cuda uses non cudable temperature compute"); + return 2; + } + return 0; +} + + +/* ---------------------------------------------------------------------- */ + +void FixTempRescaleLimitCuda::reset_target(double t_new) +{ + t_start = t_stop = t_new; +} + +/* ---------------------------------------------------------------------- */ + +double FixTempRescaleLimitCuda::compute_scalar() +{ + return energy; +} diff --git a/src/USER-CUDA/fix_temp_rescale_limit_cuda.h b/src/USER-CUDA/fix_temp_rescale_limit_cuda.h new file mode 100644 index 0000000000..7ee49d3c40 --- /dev/null +++ b/src/USER-CUDA/fix_temp_rescale_limit_cuda.h @@ -0,0 +1,61 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(temp/rescale/limit/cuda,FixTempRescaleLimitCuda)
+
+#else
+
+#ifndef FIX_TEMP_RESCALE_LIMIT_CUDA_H
+#define FIX_TEMP_RESCALE_LIMIT_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+class FixTempRescaleLimitCuda : public Fix {  // fix style "temp/rescale/limit/cuda" (see FixStyle above)
+ public:
+  FixTempRescaleLimitCuda(class LAMMPS *, int, char **);
+  ~FixTempRescaleLimitCuda();
+  int setmask();
+  void init();
+  void end_of_step();              // rescales and caps group velocities when outside the temperature window
+  int modify_param(int, char **);  // handles the fix_modify "temp" keyword
+  void reset_target(double);       // sets t_start and t_stop to the given value
+  double compute_scalar();         // returns the accumulated energy tally
+
+ private:
+  class Cuda *cuda;
+  int which;                       // BIAS or NOBIAS, chosen in init()
+  double t_start,t_stop,t_window;  // ramp end points and tolerated deviation from the target
+  double fraction,energy,efactor;  // NOTE(review): end_of_step() declares its own local efactor, so this member looks unused - confirm
+  double limit;                    // factor (> 1.0, checked in the constructor) used to derive the velocity cap
+  char *id_temp;                   // ID of the temperature compute (heap string)
+  class Compute *temperature;      // compute that supplies the current temperature
+  int tflag;                       // 1 while the compute named by id_temp was created by this fix
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/fix_viscous_cuda.cpp b/src/USER-CUDA/fix_viscous_cuda.cpp
new file mode 100644
index 0000000000..c167105027
--- /dev/null
+++ b/src/USER-CUDA/fix_viscous_cuda.cpp
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "fix_viscous_cuda.h" +#include "fix_viscous_cuda_cu.h" +#include "atom.h" +#include "update.h" +#include "respa.h" +#include "error.h" +#include "cuda_modify_flags.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +FixViscousCuda::FixViscousCuda(LAMMPS *lmp, int narg, char **arg) : + FixViscous(lmp, narg, arg) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + cu_gamma=NULL; +} + +/* ---------------------------------------------------------------------- */ + +FixViscousCuda::~FixViscousCuda() +{ + delete cu_gamma; +} + +/* ---------------------------------------------------------------------- */ + +int FixViscousCuda::setmask() +{ + int mask = 0; + mask |= POST_FORCE_CUDA; + // mask |= POST_FORCE_RESPA; + // mask |= MIN_POST_FORCE; + return mask; +} + + +/* ---------------------------------------------------------------------- */ + +void FixViscousCuda::setup(int vflag) +{ + if(not cu_gamma) + cu_gamma = new cCudaData (gamma,atom->ntypes+1); + Cuda_FixViscousCuda_Init(&cuda->shared_data); + cu_gamma->upload(); + // if (strcmp(update->integrate_style,"verlet/cuda") == 0) + post_force(vflag); + /* else { + ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1); + post_force_respa(vflag,nlevels_respa-1,0); + ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1); + }*/ +} + +/* ---------------------------------------------------------------------- */ + +void FixViscousCuda::min_setup(int vflag) +{ + Cuda_FixViscousCuda_Init(&cuda->shared_data); + post_force(vflag); +} + +/* ---------------------------------------------------------------------- */ + +void FixViscousCuda::post_force(int vflag) +{ + // apply drag force to atoms in group + // direction is opposed to velocity vector + // magnitude depends on atom type + + Cuda_FixViscousCuda_PostForce(&cuda->shared_data, groupbit,cu_gamma->dev_data()); +} diff --git a/src/USER-CUDA/fix_viscous_cuda.h b/src/USER-CUDA/fix_viscous_cuda.h new file mode 100644 index 0000000000..54e75cc0d2 --- /dev/null +++ b/src/USER-CUDA/fix_viscous_cuda.h @@ -0,0 +1,55 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + 
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(viscous/cuda,FixViscousCuda)
+
+#else
+
+#ifndef LMP_FIX_VISCOUS_CUDA_H
+#define LMP_FIX_VISCOUS_CUDA_H
+
+#include "fix_viscous.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixViscousCuda : public FixViscous {  // fix style "viscous/cuda" (see FixStyle above)
+ public:
+  FixViscousCuda(class LAMMPS *, int, char **);
+  ~FixViscousCuda();
+  int setmask();
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);
+  cCudaData* cu_gamma;  // wrapper around gamma, created in setup() and deleted in the destructor; NOTE(review): template arguments appear stripped here
+
+ private:
+  class Cuda *cuda;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/modify_cuda.cpp b/src/USER-CUDA/modify_cuda.cpp
index 7f8d7f8c5a..9f1716ac7a 100644
--- a/src/USER-CUDA/modify_cuda.cpp
+++ b/src/USER-CUDA/modify_cuda.cpp
@@ -63,6 +63,8 @@ using namespace LAMMPS_NS;
 ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp)
 {
   cuda = lmp->cuda;
+  if(cuda == NULL)  // no Cuda object on this LAMMPS instance: report it through error->all
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 
   n_initial_integrate_cuda = 0;
   n_post_integrate_cuda = 0;
diff --git a/src/USER-CUDA/neigh_full_cuda.cpp b/src/USER-CUDA/neigh_full_cuda.cpp
index 49678c1d06..197b62a0ac 100644
--- a/src/USER-CUDA/neigh_full_cuda.cpp
+++ b/src/USER-CUDA/neigh_full_cuda.cpp
@@ -21,7 +21,6 @@
    This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */ -#ifdef CUDA #include "neighbor_cuda.h" #include "neigh_list.h" #include "atom.h" @@ -313,5 +312,4 @@ return; MYDBG(printf(" # CUDA::NeighFullNSQCuda ... end\n");) */ } -#endif
diff --git a/src/USER-CUDA/neighbor_cuda.cpp b/src/USER-CUDA/neighbor_cuda.cpp
index 4575ce2acc..9626650ee8 100644
--- a/src/USER-CUDA/neighbor_cuda.cpp
+++ b/src/USER-CUDA/neighbor_cuda.cpp
@@ -36,6 +36,8 @@ enum{NSQ,BIN,MULTI}; // also in neigh_list.cpp
 NeighborCuda::NeighborCuda(LAMMPS *lmp) : Neighbor(lmp)
 {
   cuda = lmp->cuda;
+  if(cuda == NULL)  // no Cuda object on this LAMMPS instance: report it through error->all
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-CUDA/pair_born_coul_long_cuda.cpp b/src/USER-CUDA/pair_born_coul_long_cuda.cpp
new file mode 100644
index 0000000000..fa19e5c9a2
--- /dev/null
+++ b/src/USER-CUDA/pair_born_coul_long_cuda.cpp
@@ -0,0 +1,186 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   Contributing author: Paul Crozier (SNL)
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_born_coul_long_cuda.h" +#include "pair_born_coul_long_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 +/* ---------------------------------------------------------------------- */ + +PairBornCoulLongCuda::PairBornCoulLongCuda(LAMMPS *lmp) : PairBornCoulLong(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->shared_data.pair.use_block_per_atom = 0; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairBornCoulLongCuda::allocate() +{ + if(! allocated) PairBornCoulLong::allocate(); + if(! 
allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut_lj; + cuda->shared_data.pair.coeff1 = rhoinv; + cuda->shared_data.pair.coeff2 = sigma; + cuda->shared_data.pair.coeff3 = a; + cuda->shared_data.pair.coeff4 = c; + cuda->shared_data.pair.coeff5 = d; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairBornCoulLongCuda::compute(int eflag, int vflag) +{ + MYDBG( printf("PairBornCoulLongCuda compute start\n"); fflush(stdout);) + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + #ifdef CUDA_USE_BINNING + Cuda_PairBornCoulLongCuda(& cuda->shared_data, eflag, vflag); + #else + Cuda_PairBornCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + #endif + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } + MYDBG( printf("PairBornCoulLongCuda compute end\n"); fflush(stdout);) +} + +/* ---------------------------------------------------------------------- */ + +void PairBornCoulLongCuda::settings(int narg, char **arg) +{ + PairBornCoulLong::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global; +} + +/* ---------------------------------------------------------------------- */ + +void PairBornCoulLongCuda::coeff(int narg, char **arg) +{ + PairBornCoulLong::coeff(narg, arg); + allocate(); +} + +void PairBornCoulLongCuda::init_style() +{ + if (!atom->q_flag) + error->all("Pair style born/coul/long requires atom attribute q"); + // request regular or rRESPA neighbor lists + + int irequest; + + if 
(strcmp(update->integrate_style,"respa") == 0) error->all("Integrate Style Respa is not supported by pair style buck/coul/long/cuda"); + + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + + + cut_coulsq = cut_coul * cut_coul; + cuda->shared_data.pair.cut_coulsq_global=cut_coulsq; + + if (force->kspace == NULL) + error->all("Pair style is incompatible with KSpace style"); + g_ewald = force->kspace->g_ewald; + cuda->shared_data.pair.g_ewald=g_ewald; + cuda->shared_data.pppm.qqrd2e=force->qqrd2e; + + + if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n"); +} + +void PairBornCoulLongCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairBornCoulLongCuda::init_list\n");) + PairBornCoulLong::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairBornCoulLongCuda::init_list end\n");) +} + +void PairBornCoulLongCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairBornCoulLong::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} diff --git a/src/USER-CUDA/pair_born_coul_long_cuda.h b/src/USER-CUDA/pair_born_coul_long_cuda.h new file mode 100644 index 0000000000..91f6f650ae --- /dev/null +++ b/src/USER-CUDA/pair_born_coul_long_cuda.h @@ -0,0 +1,57 @@ +/* 
----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(born/coul/long/cuda,PairBornCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_BORN_COUL_LONG_CUDA_H
+#define LMP_PAIR_BORN_COUL_LONG_CUDA_H
+
+#include "pair_born_coul_long.h"
+
+namespace LAMMPS_NS {
+
+class PairBornCoulLongCuda : public PairBornCoulLong  // pair style "born/coul/long/cuda" (see PairStyle above)
+{
+ public:
+  PairBornCoulLongCuda(class LAMMPS *);
+  void compute(int, int);                 // arguments are eflag, vflag
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;
+  void allocate();
+  bool allocated2;                        // true once allocate() has published the coefficient pointers to the shared data
+  class CudaNeighList* cuda_neigh_list;   // assigned in init_list() for list id 0; NOTE(review): not initialized in the constructor
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp b/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
new file mode 100644
index 0000000000..5d7fd4fc3f
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
@@ -0,0 +1,173 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton,
sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   Contributing author: Paul Crozier (SNL)
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+// NOTE(review): the four system header names were missing from the listing
+// this was taken from (bare "#include"). They are restored from what the
+// file uses (printf/fflush, NULL, strcmp); <cmath> is presumed - confirm.
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_coul_cut_cuda.h"
+#include "pair_buck_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ----------------------------------------------------------------------
+   take the Cuda handle from the LAMMPS instance (error->all if it is NULL),
+   flag the pair force as cudable and refresh the CUDA system parameters
+------------------------------------------------------------------------- */
+
+PairBuckCoulCutCuda::PairBuckCoulCutCuda(LAMMPS *lmp) : PairBuckCoulCut(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  // fix: init_list() assigns this only for list id 0 and only without
+  // CUDA_USE_BINNING, so give it a defined value up front
+  cuda_neigh_list = NULL;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+   (base-class arrays are allocated first if needed; the pointers are
+   published only once, guarded by allocated2)
+------------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::allocate()
+{
+  if(! allocated) PairBuckCoulCut::allocate();
+  if(allocated2) return;
+
+  allocated2 = true;
+  cuda->shared_data.pair.cut_coul = cut_coul;
+  cuda->shared_data.pair.cut = cut_lj;
+  cuda->shared_data.pair.coeff1 = rhoinv;
+  cuda->shared_data.pair.coeff2 = buck1;
+  cuda->shared_data.pair.coeff3 = buck2;
+  cuda->shared_data.pair.coeff4 = a;
+  cuda->shared_data.pair.coeff5 = c;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ----------------------------------------------------------------------
+   upload the energy / virial accumulators that are requested, run
+   Cuda_PairBuckCoulCutCuda, and download them again unless
+   shared_data.pair.collect_forces_later is set
+------------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBuckCoulCutCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if(eflag) cuda->cu_eng_vdwl->upload();
+  if(eflag) cuda->cu_eng_coul->upload();
+  if(vflag) cuda->cu_virial->upload();
+
+  Cuda_PairBuckCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  if(not cuda->shared_data.pair.collect_forces_later)
+  {
+    if(eflag) cuda->cu_eng_vdwl->download();
+    if(eflag) cuda->cu_eng_coul->download();
+    if(vflag) cuda->cu_virial->download();
+  }
+  MYDBG( printf("PairBuckCoulCutCuda compute end\n"); fflush(stdout);)
+}
+
+/* ----------------------------------------------------------------------
+   parse the pair_style arguments in the base class, then copy the global
+   cutoffs (cast to F_FLOAT) into the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::settings(int narg, char **arg)
+{
+  PairBuckCoulCut::settings(narg, arg);
+  cuda->shared_data.pair.cut_coul_global = (F_FLOAT) cut_coul_global;
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global;
+}
+
+/* ----------------------------------------------------------------------
+   parse the pair_coeff arguments in the base class, then make sure the
+   coefficient arrays are published (see allocate())
+------------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::coeff(int narg, char **arg)
+{
+  PairBuckCoulCut::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   require per-atom charge, reject the respa integrator, request a full
+   neighbor list flagged cudable and publish qqrd2e and the squared global
+   coulomb cutoff
+
+   fix: both error messages named buck/coul/long instead of this style
+------------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style buck/coul/cut/cuda requires atom attribute q");
+
+  if (strcmp(update->integrate_style,"respa") == 0) error->all("Integrate Style Respa is not supported by pair style buck/coul/cut/cuda");
+
+  // only a full, cudable list is requested (no rRESPA lists)
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+  cuda->shared_data.pair.cut_coulsq_global=cut_coul_global * cut_coul_global;
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+/* ----------------------------------------------------------------------
+   forward the neighbor list to the base class and, unless the package is
+   built with CUDA_USE_BINNING, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairBuckCoulCutCuda::init_list\n");)
+  PairBuckCoulCut::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairBuckCoulCutCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup, then rebuild the cCudaData mirrors of the
+   per-atom energy (eatom) and per-atom virial (vatom) host arrays when
+   atom->nmax exceeds the size recorded before the base-class call
+
+   fix: the virial mirror used to be guarded by eflag_atom and bound to
+   shared_data.atom.eatom (copy-paste of the energy branch); it is now
+   guarded by vflag_atom, compared against maxvatom and bound to
+   shared_data.atom.vatom
+
+   NOTE(review): the cCudaData template arguments were missing from the
+   listing this was taken from; they are restored from the cu_eatom /
+   cu_vatom usage - confirm against cuda.h
+------------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes before the base class possibly grows eatom / vatom
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairBuckCoulCut::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );
+  }
+
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );
+  }
+
+}
diff --git a/src/USER-CUDA/pair_buck_coul_cut_cuda.h b/src/USER-CUDA/pair_buck_coul_cut_cuda.h
new file mode 100644
index 0000000000..b46a18364a
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_coul_cut_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+
Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/cut/cuda,PairBuckCoulCutCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_CUT_CUDA_H
+#define LMP_PAIR_BUCK_COUL_CUT_CUDA_H
+
+#include "pair_buck_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// only pointers to these types are stored or passed here
+class Cuda;
+class CudaNeighList;
+class NeighList;
+
+// Pair style buck/coul/cut/cuda: PairBuckCoulCut with the hooks below
+// overridden by the USER-CUDA implementation.
+class PairBuckCoulCutCuda : public PairBuckCoulCut
+{
+ public:
+  PairBuckCoulCutCuda(class LAMMPS *lmp);
+
+  void init_style();
+  void init_list(int id, NeighList *ptr);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void compute(int eflag, int vflag);
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  void allocate();
+
+  Cuda *cuda;                      // handle taken from the LAMMPS instance
+  bool allocated2;                 // guards the one-time CUDA-side setup in allocate()
+  CudaNeighList* cuda_neigh_list;  // list handle returned by Cuda::registerNeighborList
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_buck_coul_long_cuda.cpp b/src/USER-CUDA/pair_buck_coul_long_cuda.cpp
new file mode 100644
index 0000000000..558d42a29d
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_coul_long_cuda.cpp
@@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   Contributing author: Paul Crozier (SNL)
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+// NOTE(review): the four system header names were missing from the listing
+// this was taken from (bare "#include"). They are restored from what the
+// file uses (printf/fflush, NULL, strcmp); <cmath> is presumed - confirm.
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_coul_long_cuda.h"
+#include "pair_buck_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F 1.12837917
+#define EWALD_P 0.3275911
+#define A1 0.254829592
+#define A2 -0.284496736
+#define A3 1.421413741
+#define A4 -1.453152027
+#define A5 1.061405429
+
+/* ----------------------------------------------------------------------
+   take the Cuda handle from the LAMMPS instance (error->all if it is NULL),
+   flag the pair force as cudable and refresh the CUDA system parameters
+------------------------------------------------------------------------- */
+
+PairBuckCoulLongCuda::PairBuckCoulLongCuda(LAMMPS *lmp) : PairBuckCoulLong(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  // fix: init_list() assigns this only for list id 0 and only without
+  // CUDA_USE_BINNING, so give it a defined value up front
+  cuda_neigh_list = NULL;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+   (base-class arrays are allocated first if needed; the pointers are
+   published only once, guarded by allocated2)
+------------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::allocate()
+{
+  if(! allocated) PairBuckCoulLong::allocate();
+  if(allocated2) return;
+
+  allocated2 = true;
+  cuda->shared_data.pair.cut = cut_lj;
+  cuda->shared_data.pair.coeff1 = rhoinv;
+  cuda->shared_data.pair.coeff2 = buck1;
+  cuda->shared_data.pair.coeff3 = buck2;
+  cuda->shared_data.pair.coeff4 = a;
+  cuda->shared_data.pair.coeff5 = c;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ----------------------------------------------------------------------
+   upload the energy / virial accumulators that are requested, run
+   Cuda_PairBuckCoulLongCuda, and download them again unless
+   shared_data.pair.collect_forces_later is set
+------------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBuckCoulLongCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if(eflag) cuda->cu_eng_vdwl->upload();
+  if(eflag) cuda->cu_eng_coul->upload();
+  if(vflag) cuda->cu_virial->upload();
+
+  Cuda_PairBuckCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  if(not cuda->shared_data.pair.collect_forces_later)
+  {
+    if(eflag) cuda->cu_eng_vdwl->download();
+    if(eflag) cuda->cu_eng_coul->download();
+    if(vflag) cuda->cu_virial->download();
+  }
+  MYDBG( printf("PairBuckCoulLongCuda compute end\n"); fflush(stdout);)
+}
+
+/* ----------------------------------------------------------------------
+   parse the pair_style arguments in the base class, then copy the global
+   cutoff (cast to F_FLOAT) into the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::settings(int narg, char **arg)
+{
+  PairBuckCoulLong::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global;
+}
+
+/* ----------------------------------------------------------------------
+   parse the pair_coeff arguments in the base class, then make sure the
+   coefficient arrays are published (see allocate())
+------------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::coeff(int narg, char **arg)
+{
+  PairBuckCoulLong::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   require per-atom charge and a KSpace style, reject the respa
+   integrator, request a full neighbor list flagged cudable and publish
+   the squared coulomb cutoff, g_ewald and qqrd2e
+------------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style buck/coul/long requires atom attribute q");
+
+  if (strcmp(update->integrate_style,"respa") == 0) error->all("Integrate Style Respa is not supported by pair style buck/coul/long/cuda");
+
+  // only a full, cudable list is requested (no rRESPA lists)
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+
+  cut_coulsq = cut_coul * cut_coul;
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+/* ----------------------------------------------------------------------
+   forward the neighbor list to the base class and, unless the package is
+   built with CUDA_USE_BINNING, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairBuckCoulLongCuda::init_list\n");)
+  PairBuckCoulLong::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairBuckCoulLongCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup, then rebuild the cCudaData mirrors of the
+   per-atom energy (eatom) and per-atom virial (vatom) host arrays when
+   atom->nmax exceeds the size recorded before the base-class call
+
+   fix: the virial mirror used to be guarded by eflag_atom and bound to
+   shared_data.atom.eatom (copy-paste of the energy branch); it is now
+   guarded by vflag_atom, compared against maxvatom and bound to
+   shared_data.atom.vatom
+
+   NOTE(review): the cCudaData template arguments were missing from the
+   listing this was taken from; they are restored from the cu_eatom /
+   cu_vatom usage - confirm against cuda.h
+------------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes before the base class possibly grows eatom / vatom
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairBuckCoulLong::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );
+  }
+
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );
+  }
+
+}
diff --git a/src/USER-CUDA/pair_buck_coul_long_cuda.h b/src/USER-CUDA/pair_buck_coul_long_cuda.h
new file mode 100644
index 0000000000..39a3791031
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_coul_long_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/long/cuda,PairBuckCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_LONG_CUDA_H
+#define LMP_PAIR_BUCK_COUL_LONG_CUDA_H
+
+#include "pair_buck_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// only pointers to these types are stored or passed here
+class Cuda;
+class CudaNeighList;
+class NeighList;
+
+// Pair style buck/coul/long/cuda: PairBuckCoulLong with the hooks below
+// overridden by the USER-CUDA implementation.
+class PairBuckCoulLongCuda : public PairBuckCoulLong
+{
+ public:
+  PairBuckCoulLongCuda(class LAMMPS *lmp);
+
+  void init_style();
+  void init_list(int id, NeighList *ptr);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void compute(int eflag, int vflag);
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  void allocate();
+
+  Cuda *cuda;                      // handle taken from the LAMMPS instance
+  bool allocated2;                 // guards the one-time CUDA-side setup in allocate()
+  CudaNeighList* cuda_neigh_list;  // list handle returned by Cuda::registerNeighborList
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_buck_cuda.cpp b/src/USER-CUDA/pair_buck_cuda.cpp
new file mode 100644
index 0000000000..b8f164b923
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_cuda.cpp
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   Contributing author: Paul Crozier (SNL)
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+// NOTE(review): the four system header names were missing from the listing
+// this was taken from (bare "#include"). They are restored from what the
+// file uses (printf/fflush, NULL, strcmp); <cmath> is presumed - confirm.
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_cuda.h"
+#include "pair_buck_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ----------------------------------------------------------------------
+   take the Cuda handle from the LAMMPS instance (error->all if it is NULL),
+   flag the pair force as cudable and refresh the CUDA system parameters
+------------------------------------------------------------------------- */
+
+PairBuckCuda::PairBuckCuda(LAMMPS *lmp) : PairBuck(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  // fix: init_list() assigns this only for list id 0 and only without
+  // CUDA_USE_BINNING, so give it a defined value up front
+  cuda_neigh_list = NULL;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+   (base-class arrays are allocated first if needed; the pointers are
+   published only once, guarded by allocated2)
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::allocate()
+{
+  if(! allocated) PairBuck::allocate();
+  if(allocated2) return;
+
+  allocated2 = true;
+  cuda->shared_data.pair.cut = cut;
+  cuda->shared_data.pair.coeff1 = rhoinv;
+  cuda->shared_data.pair.coeff2 = buck1;
+  cuda->shared_data.pair.coeff3 = buck2;
+  cuda->shared_data.pair.coeff4 = a;
+  cuda->shared_data.pair.coeff5 = c;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+}
+
+/* ----------------------------------------------------------------------
+   upload the energy / virial accumulators that are requested, run
+   Cuda_PairBuckCuda, and download them again unless
+   shared_data.pair.collect_forces_later is set
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBuckCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if(eflag) cuda->cu_eng_vdwl->upload();
+  if(eflag) cuda->cu_eng_coul->upload();
+  if(vflag) cuda->cu_virial->upload();
+
+  Cuda_PairBuckCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  if(not cuda->shared_data.pair.collect_forces_later)
+  {
+    if(eflag) cuda->cu_eng_vdwl->download();
+    if(eflag) cuda->cu_eng_coul->download();
+    if(vflag) cuda->cu_virial->download();
+  }
+  MYDBG( printf("PairBuckCuda compute end\n"); fflush(stdout);)
+}
+
+/* ----------------------------------------------------------------------
+   parse the pair_style arguments in the base class, then copy the global
+   cutoff (cast to F_FLOAT) into the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::settings(int narg, char **arg)
+{
+  PairBuck::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global;
+}
+
+/* ----------------------------------------------------------------------
+   parse the pair_coeff arguments in the base class, then make sure the
+   coefficient arrays are published (see allocate())
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::coeff(int narg, char **arg)
+{
+  PairBuck::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   reject the respa integrator, request a full neighbor list flagged
+   cudable and publish qqrd2e
+
+   fix: both error messages named buck/coul/long instead of this style
+
+   NOTE(review): allocate() publishes no coulomb data for this style, so
+   the atom->q_flag requirement looks inherited from the coul variants;
+   it is kept unchanged here - confirm whether it can be dropped
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style buck/cuda requires atom attribute q");
+
+  if (strcmp(update->integrate_style,"respa") == 0) error->all("Integrate Style Respa is not supported by pair style buck/cuda");
+
+  // only a full, cudable list is requested (no rRESPA lists)
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+/* ----------------------------------------------------------------------
+   forward the neighbor list to the base class and, unless the package is
+   built with CUDA_USE_BINNING, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairBuckCuda::init_list\n");)
+  PairBuck::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairBuckCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup, then rebuild the cCudaData mirrors of the
+   per-atom energy (eatom) and per-atom virial (vatom) host arrays when
+   atom->nmax exceeds the size recorded before the base-class call
+
+   fix: the virial mirror used to be guarded by eflag_atom and bound to
+   shared_data.atom.eatom (copy-paste of the energy branch); it is now
+   guarded by vflag_atom, compared against maxvatom and bound to
+   shared_data.atom.vatom
+
+   NOTE(review): the cCudaData template arguments were missing from the
+   listing this was taken from; they are restored from the cu_eatom /
+   cu_vatom usage - confirm against cuda.h
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes before the base class possibly grows eatom / vatom
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairBuck::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );
+  }
+
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );
+  }
+
+}
diff --git a/src/USER-CUDA/pair_buck_cuda.h b/src/USER-CUDA/pair_buck_cuda.h
new file mode 100644
index 0000000000..9ec29e1662
--- /dev/null
+++ b/src/USER-CUDA/pair_buck_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/cuda,PairBuckCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_CUDA_H
+#define LMP_PAIR_BUCK_CUDA_H
+
+#include "pair_buck.h"
+
+namespace LAMMPS_NS {
+
+// only pointers to these types are stored or passed here
+class Cuda;
+class CudaNeighList;
+class NeighList;
+
+// Pair style buck/cuda: PairBuck with the hooks below overridden by the
+// USER-CUDA implementation.
+class PairBuckCuda : public PairBuck
+{
+ public:
+  PairBuckCuda(class LAMMPS *lmp);
+
+  void init_style();
+  void init_list(int id, NeighList *ptr);
+  void settings(int narg, char **arg);
+  void coeff(int narg, char **arg);
+  void compute(int eflag, int vflag);
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  void allocate();
+
+  Cuda *cuda;                      // handle taken from the LAMMPS instance
+  bool allocated2;                 // guards the one-time CUDA-side setup in allocate()
+  CudaNeighList* cuda_neigh_list;  // list handle returned by Cuda::registerNeighborList
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.cpp b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.cpp
new file mode 100644
index 0000000000..f0b50469ce
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.cpp
@@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+// NOTE(review): the four system header names were missing from the listing
+// this was taken from (bare "#include"). They are restored from what the
+// file uses (printf, NULL, strcmp); <cmath> is presumed - confirm.
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_coul_cut_cuda.h"
+#include "pair_cg_cmm_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ----------------------------------------------------------------------
+   take the Cuda handle from the LAMMPS instance (error->all if it is NULL),
+   flag the pair force as cudable and refresh the CUDA system parameters
+------------------------------------------------------------------------- */
+
+PairCGCMMCoulCutCuda::PairCGCMMCoulCutCuda(LAMMPS *lmp) : PairCGCMMCoulCut(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cg_type_double = NULL;
+  // fix: init_list() assigns this only for list id 0 and only without
+  // CUDA_USE_BINNING, so give it a defined value up front
+  cuda_neigh_list = NULL;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+   (on the first call cg_type_double is created and the pointers are
+   published; on every call cg_type is copied symmetrically into
+   cg_type_double)
+
+   NOTE(review): nothing in this file releases cg_type_double
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::allocate()
+{
+  if(! allocated) PairCGCMMCoulCut::allocate();
+  const int n = atom->ntypes;
+  if(! allocated2)
+  {
+    allocated2 = true;
+
+
+    memory->create(cg_type_double,n+1,n+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut = cut_lj;
+    cuda->shared_data.pair.cut_coul= cut_coul;
+    cuda->shared_data.pair.coeff1 = lj1;
+    cuda->shared_data.pair.coeff2 = lj2;
+    cuda->shared_data.pair.coeff3 = lj3;
+    cuda->shared_data.pair.coeff4 = lj4;
+    cuda->shared_data.pair.coeff5 = cg_type_double;
+    cuda->shared_data.pair.offset = offset;
+    cuda->shared_data.pair.special_lj = force->special_lj;
+    cuda->shared_data.pair.special_coul = force->special_coul;
+  }
+  // mirror the upper triangle of cg_type into both halves
+  for (int i = 1; i <= n; i++) {
+    for (int j = i; j <= n; j++) {
+      cg_type_double[i][j] = cg_type[i][j];
+      cg_type_double[j][i] = cg_type[i][j];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   upload the energy / virial accumulators that are requested, run
+   Cuda_PairCGCMMCoulCutCuda, and download them again unless
+   shared_data.pair.collect_forces_later is set
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if(eflag) cuda->cu_eng_vdwl->upload();
+  if(eflag) cuda->cu_eng_coul->upload();
+  if(vflag) cuda->cu_virial->upload();
+
+  Cuda_PairCGCMMCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  if(not cuda->shared_data.pair.collect_forces_later)
+  {
+    if(eflag) cuda->cu_eng_vdwl->download();
+    if(eflag) cuda->cu_eng_coul->download();
+    if(vflag) cuda->cu_virial->download();
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   parse the pair_style arguments in the base class, then copy the global
+   cutoffs and kappa (cast to F_FLOAT) into the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::settings(int narg, char **arg)
+{
+  PairCGCMMCoulCut::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global;
+  cuda->shared_data.pair.cut_coul_global = (F_FLOAT) cut_coul_global;
+  cuda->shared_data.pair.kappa = (F_FLOAT) kappa;
+}
+
+/* ----------------------------------------------------------------------
+   parse the pair_coeff arguments in the base class, then refresh the
+   CUDA-side coefficient data (see allocate())
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::coeff(int narg, char **arg)
+{
+  PairCGCMMCoulCut::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   request a full neighbor list flagged cudable (no list is requested
+   when update->whichflag is 0 and the integrator is respa), publish
+   qqrd2e, clear cut_respa and warn when force->newton is set
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_style start\n"); )
+
+  const bool respa_setup = update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0;
+  if (!respa_setup)
+  {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+  cut_respa=NULL;
+  if (force->newton) error->warning("Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+  MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   forward the neighbor list to the base class and, unless the package is
+   built with CUDA_USE_BINNING, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_list\n");)
+  PairCGCMMCoulCut::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup, then rebuild the cCudaData mirrors of the
+   per-atom energy (eatom) and per-atom virial (vatom) host arrays when
+   atom->nmax exceeds the size recorded before the base-class call
+
+   fix: the virial mirror used to be guarded by eflag_atom and bound to
+   shared_data.atom.eatom (copy-paste of the energy branch); it is now
+   guarded by vflag_atom, compared against maxvatom and bound to
+   shared_data.atom.vatom
+
+   NOTE(review): the cCudaData template arguments were missing from the
+   listing this was taken from; they are restored from the cu_eatom /
+   cu_vatom usage - confirm against cuda.h
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes before the base class possibly grows eatom / vatom
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairCGCMMCoulCut::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );
+  }
+
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );
+  }
+
+}
+
+
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.h b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.h
new file mode 100644
index 0000000000..467b8a3feb
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.h
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/coul/cut/cuda,PairCGCMMCoulCutCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_CUT_CUDA_H
+#define PAIR_CG_CMM_COUL_CUT_CUDA_H
+
+#include "pair_cg_cmm_coul_cut.h"
+#include "memory.h"
+
+namespace LAMMPS_NS {
+
+// Pair style cg/cmm/coul/cut/cuda: PairCGCMMCoulCut with the hooks below
+// overridden by the USER-CUDA implementation.
+class PairCGCMMCoulCutCuda : public PairCGCMMCoulCut
+{
+ public:
+  PairCGCMMCoulCutCuda(class LAMMPS *);
+  // fix: cg_type_double is created with memory->create() in allocate() and
+  // was never released; the constructor sets it to NULL, so destroying it
+  // here is safe even when allocate() never ran
+  // (presumes Memory::destroy accepts a NULL array - confirm in memory.h)
+  ~PairCGCMMCoulCutCuda() { memory->destroy(cg_type_double); }
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+ protected:
+  class Cuda *cuda;                      // handle taken from the LAMMPS instance
+  void allocate();
+  bool allocated2;                       // guards the one-time CUDA-side setup in allocate()
+  class CudaNeighList* cuda_neigh_list;  // list handle returned by Cuda::registerNeighborList
+  double** cg_type_double;               // copy of cg_type stored as double, published as pair.coeff5
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.cpp b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.cpp
new file mode 100644
index 0000000000..43f0f22b09
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.cpp
@@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <math.h>    // NOTE(review): the four system header names were blank in this copy (angle-bracket text lost); restored per LAMMPS convention -- confirm
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_cg_cmm_coul_debye_cuda.h"
+#include "pair_cg_cmm_coul_debye_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+// Constructor: takes the Cuda handle from lmp, aborts via error->all if it is NULL, then resets allocation state and flags the pair force as cudable.
+PairCGCMMCoulDebyeCuda::PairCGCMMCoulDebyeCuda(LAMMPS *lmp) : PairCGCMMCoulCut(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cg_type_double = NULL;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+// The pointers are published only once (guarded by allocated2); cg_type is re-copied into cg_type_double on every call.
+void PairCGCMMCoulDebyeCuda::allocate()
+{
+  if(! allocated) PairCGCMMCoulCut::allocate();
+  int n = atom->ntypes;
+  if(! allocated2)
+  {
+    allocated2 = true;
+
+
+    memory->create(cg_type_double,n+1,n+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut = cut_lj;
+    cuda->shared_data.pair.cut_coul= cut_coul;
+    cuda->shared_data.pair.coeff1 = lj1;
+    cuda->shared_data.pair.coeff2 = lj2;
+    cuda->shared_data.pair.coeff3 = lj3;
+    cuda->shared_data.pair.coeff4 = lj4;
+    cuda->shared_data.pair.coeff5 = cg_type_double;
+    cuda->shared_data.pair.offset = offset;
+    cuda->shared_data.pair.special_lj = force->special_lj;
+    cuda->shared_data.pair.special_coul = force->special_coul;
+  }
+  for (int i = 1; i <= n; i++) {
+    for (int j = i; j <= n; j++) {
+      cg_type_double[i][j] = cg_type[i][j];   // mirror the upper triangle into both halves
+      cg_type_double[j][i] = cg_type[i][j];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+// compute: uploads the requested energy/virial accumulators, runs the CUDA pair routine, and downloads them again unless collect_forces_later is set.
+void PairCGCMMCoulDebyeCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if(eflag) cuda->cu_eng_vdwl->upload();
+  if(eflag) cuda->cu_eng_coul->upload();
+  if(vflag) cuda->cu_virial->upload();
+
+  Cuda_PairCGCMMCoulDebyeCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  if(not cuda->shared_data.pair.collect_forces_later)
+  {
+    if(eflag) cuda->cu_eng_vdwl->download();
+    if(eflag) cuda->cu_eng_coul->download();
+    if(vflag) cuda->cu_virial->download();
+  }
+
+}
+
+/* ---------------------------------------------------------------------- */
+// settings: base-class parsing, then the global cutoffs and kappa are copied into shared_data.pair as F_FLOAT.
+void PairCGCMMCoulDebyeCuda::settings(int narg, char **arg)
+{
+  PairCGCMMCoulCut::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global;
+  cuda->shared_data.pair.cut_coul_global = (F_FLOAT) cut_coul_global;
+  cuda->shared_data.pair.kappa = (F_FLOAT) kappa;
+}
+
+/* ---------------------------------------------------------------------- */
+// coeff: base-class parsing followed by allocate(), which refreshes cg_type_double.
+void PairCGCMMCoulDebyeCuda::coeff(int narg, char **arg)
+{
+  PairCGCMMCoulCut::coeff(narg, arg);
+  allocate();
+}
+// init_style: requests a full, cudable neighbor list (no request when whichflag == 0 with the respa integrator), stores qqrd2e, clears cut_respa.
+void PairCGCMMCoulDebyeCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_style start\n"); )
+  // request regular or rRESPA neighbor lists
+
+  int irequest;
+
+  if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) {
+
+  }
+  else
+  {
+    irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+  cut_respa=NULL;
+  if (force->newton) error->warning("Pair style does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+  MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_style end\n"); )
+}
+// init_list: forwards to the base class and, unless CUDA_USE_BINNING is defined, registers list id 0 with the Cuda object.
+void PairCGCMMCoulDebyeCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_list\n");)
+  PairCGCMMCoulCut::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_list end\n");)
+}
+// ev_setup: runs the base ev_setup, then rebuilds the cu_eatom / cu_vatom wrappers when atom->nmax exceeds the size recorded before the call.
+void PairCGCMMCoulDebyeCuda::ev_setup(int eflag, int vflag)
+{
+  int maxeatomold=maxeatom, maxvatomold=maxvatom;
+  PairCGCMMCoulCut::ev_setup(eflag,vflag);
+  // NOTE(review): cCudaData appears without template arguments in this copy; presumably lost in extraction -- confirm against cuda_data.h
+  if (eflag_atom && atom->nmax > maxeatomold)
+  {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+  // per-atom virial: keyed on vflag_atom/maxvatom and bound to atom.vatom (was a copy of the eatom condition and target)
+  if (vflag_atom && atom->nmax > maxvatomold)
+  {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.h b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.h
new file mode 100644
index 0000000000..a392125161
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.h
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// With PAIR_CLASS defined this header contributes only the PairStyle entry that pairs the style name with the class.
+PairStyle(cg/cmm/coul/debye/cuda,PairCGCMMCoulDebyeCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_DEBYE_CUDA_H
+#define PAIR_CG_CMM_COUL_DEBYE_CUDA_H
+
+#include "pair_cg_cmm_coul_cut.h"
+
+namespace LAMMPS_NS {
+// CUDA flavour of the Debye style; it derives from PairCGCMMCoulCut and forwards settings/coeff/init_list/ev_setup to it.
+class PairCGCMMCoulDebyeCuda : public PairCGCMMCoulCut
+{
+ public:
+  PairCGCMMCoulDebyeCuda(class LAMMPS *);
+  void compute(int, int);                  // (eflag, vflag): runs Cuda_PairCGCMMCoulDebyeCuda
+  void settings(int, char **);             // also copies cutoffs and kappa into shared_data.pair
+  void coeff(int, char **);                // base parsing, then allocate()
+  void init_list(int, class NeighList *);  // registers list id 0 with the Cuda object
+  void init_style();                       // requests a full, cudable neighbor list
+  void ev_setup(int eflag, int vflag);     // rebuilds the cu_eatom / cu_vatom wrappers after growth
+ protected:
+  class Cuda *cuda;                        // set from lmp->cuda in the constructor
+  void allocate();                         // publishes coefficient pointers, fills cg_type_double
+  bool allocated2;                         // true once allocate() has done its one-time setup
+  class CudaNeighList* cuda_neigh_list;    // assigned in init_list()
+  double** cg_type_double;                 // cg_type values stored as double; handed over as pair.coeff5
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.cpp b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.cpp
new file mode 100644
index 0000000000..680daaf1c0
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.cpp
@@ -0,0 +1,206 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <math.h>    // NOTE(review): the four system header names were blank in this copy (angle-bracket text lost); restored per LAMMPS convention -- confirm
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_cg_cmm_coul_long_cuda.h"
+#include "pair_cg_cmm_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+// Constructor: takes the Cuda handle from lmp, aborts via error->all if it is NULL, then resets allocation state and flags the pair force as cudable.
+PairCGCMMCoulLongCuda::PairCGCMMCoulLongCuda(LAMMPS *lmp) : PairCGCMMCoulLong(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cg_type_double = NULL;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+// The pointers are published only once (guarded by allocated2); cg_type is re-copied into cg_type_double on every call.
+void PairCGCMMCoulLongCuda::allocate()
+{
+  if(! allocated) PairCGCMMCoulLong::allocate();
+  int n = atom->ntypes;
+  if(! allocated2)
+  {
+    allocated2 = true;
+
+
+    memory->create(cg_type_double,n+1,n+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut = cut_lj;
+    cuda->shared_data.pair.cut_coul= cut_coul;
+    cuda->shared_data.pair.coeff1 = lj1;
+    cuda->shared_data.pair.coeff2 = lj2;
+    cuda->shared_data.pair.coeff3 = lj3;
+    cuda->shared_data.pair.coeff4 = lj4;
+    cuda->shared_data.pair.coeff5 = cg_type_double;
+    cuda->shared_data.pair.offset = offset;
+    cuda->shared_data.pair.special_lj = force->special_lj;
+    cuda->shared_data.pair.special_coul = force->special_coul;
+  }
+  for (int i = 1; i <= n; i++) {
+    for (int j = i; j <= n; j++) {
+      cg_type_double[i][j] = cg_type[i][j];   // mirror the upper triangle into both halves
+      cg_type_double[j][i] = cg_type[i][j];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+// compute: uploads the requested energy/virial accumulators, runs the CUDA pair routine, and downloads them again unless collect_forces_later is set.
+void PairCGCMMCoulLongCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if(eflag) cuda->cu_eng_vdwl->upload();
+  if(eflag) cuda->cu_eng_coul->upload();
+  if(vflag) cuda->cu_virial->upload();
+
+  Cuda_PairCGCMMCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  if(not cuda->shared_data.pair.collect_forces_later)
+  {
+    if(eflag) cuda->cu_eng_vdwl->download();
+    if(eflag) cuda->cu_eng_coul->download();
+    if(vflag) cuda->cu_virial->download();
+  }
+
+}
+
+/* ---------------------------------------------------------------------- */
+// settings: base-class parsing, then the global cutoffs and kappa are copied into shared_data.pair as F_FLOAT.
+void PairCGCMMCoulLongCuda::settings(int narg, char **arg)
+{
+  PairCGCMMCoulLong::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global;
+  cuda->shared_data.pair.cut_coul_global = (F_FLOAT) cut_coul_global;
+  cuda->shared_data.pair.kappa = (F_FLOAT) kappa;
+}
+
+/* ---------------------------------------------------------------------- */
+// coeff: base-class parsing followed by allocate(), which refreshes cg_type_double.
+void PairCGCMMCoulLongCuda::coeff(int narg, char **arg)
+{
+  PairCGCMMCoulLong::coeff(narg, arg);
+  allocate();
+}
+// init_style: requests a full, cudable neighbor list (no request when whichflag == 0 with the respa integrator), then stores g_ewald and qqrd2e.
+void PairCGCMMCoulLongCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_style start\n"); )
+  // request regular or rRESPA neighbor lists
+
+  int irequest;
+
+  if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) {
+
+  }
+  else
+  {
+    irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+  if (force->kspace == NULL) error->all("Pair style is incompatible with KSpace style");   // g_ewald below is read through force->kspace
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+  cut_respa=NULL;
+  if (force->newton) error->warning("Pair style does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+  MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_style end\n"); )
+}
+// init_list: forwards to the base class and, unless CUDA_USE_BINNING is defined, registers list id 0 with the Cuda object.
+void PairCGCMMCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_list\n");)
+  PairCGCMMCoulLong::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_list end\n");)
+}
+// ev_setup: runs the base ev_setup, then rebuilds the cu_eatom / cu_vatom wrappers when atom->nmax exceeds the size recorded before the call.
+void PairCGCMMCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  int maxeatomold=maxeatom, maxvatomold=maxvatom;
+  PairCGCMMCoulLong::ev_setup(eflag,vflag);
+  // NOTE(review): cCudaData appears without template arguments in this copy; presumably lost in extraction -- confirm against cuda_data.h
+  if (eflag_atom && atom->nmax > maxeatomold)
+  {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+  // per-atom virial: keyed on vflag_atom/maxvatom and bound to atom.vatom (was a copy of the eatom condition and target)
+  if (vflag_atom && atom->nmax > maxvatomold)
+  {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.h b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.h
new file mode 100644
index 0000000000..cad37fc8ca
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.h
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// With PAIR_CLASS defined this header contributes only the PairStyle entry that pairs the style name with the class.
+PairStyle(cg/cmm/coul/long/cuda,PairCGCMMCoulLongCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_LONG_CUDA_H
+#define PAIR_CG_CMM_COUL_LONG_CUDA_H
+
+#include "pair_cg_cmm_coul_long.h"
+
+namespace LAMMPS_NS {
+// CUDA flavour of PairCGCMMCoulLong; it derives from it and forwards settings/coeff/init_list/ev_setup to it.
+class PairCGCMMCoulLongCuda : public PairCGCMMCoulLong
+{
+ public:
+  PairCGCMMCoulLongCuda(class LAMMPS *);
+  void compute(int, int);                  // (eflag, vflag): runs Cuda_PairCGCMMCoulLongCuda
+  void settings(int, char **);             // also copies cutoffs and kappa into shared_data.pair
+  void coeff(int, char **);                // base parsing, then allocate()
+  void init_list(int, class NeighList *);  // registers list id 0 with the Cuda object
+  void init_style();                       // requests a full, cudable neighbor list; reads g_ewald from force->kspace
+  void ev_setup(int eflag, int vflag);     // rebuilds the cu_eatom / cu_vatom wrappers after growth
+ protected:
+  class Cuda *cuda;                        // set from lmp->cuda in the constructor
+  void allocate();                         // publishes coefficient pointers, fills cg_type_double
+  bool allocated2;                         // true once allocate() has done its one-time setup
+  class CudaNeighList* cuda_neigh_list;    // assigned in init_list()
+  double** cg_type_double;                 // cg_type values stored as double; handed over as pair.coeff5
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_cg_cmm_cuda.cpp b/src/USER-CUDA/pair_cg_cmm_cuda.cpp
new file mode 100644
index 0000000000..faaf190b7a
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_cuda.cpp
@@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <math.h>    // NOTE(review): the four system header names were blank in this copy (angle-bracket text lost); restored per LAMMPS convention -- confirm
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_cg_cmm_cuda.h"
+#include "pair_cg_cmm_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+// Constructor: takes the Cuda handle from lmp, aborts via error->all if it is NULL, then resets allocation state and flags the pair force as cudable.
+PairCGCMMCuda::PairCGCMMCuda(LAMMPS *lmp) : PairCGCMM(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cg_type_double = NULL;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+// The pointers are published only once (guarded by allocated2); cg_type is re-copied into cg_type_double on every call.
+void PairCGCMMCuda::allocate()
+{
+  if(! allocated) PairCGCMM::allocate();
+  int n = atom->ntypes;
+  if(! allocated2)
+  {
+    allocated2 = true;
+
+
+    memory->create(cg_type_double,n+1,n+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut = cut;
+    cuda->shared_data.pair.coeff1 = lj1;
+    cuda->shared_data.pair.coeff2 = lj2;
+    cuda->shared_data.pair.coeff3 = lj3;
+    cuda->shared_data.pair.coeff4 = lj4;
+    cuda->shared_data.pair.coeff5 = cg_type_double;
+    /*cu_lj1_gm = new cCudaData ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj2_gm = new cCudaData ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj3_gm = new cCudaData ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj4_gm = new cCudaData ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_cg_type_double_gm = new cCudaData ((double*)cg_type_double, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1));*/
+    cuda->shared_data.pair.offset = offset;
+    cuda->shared_data.pair.special_lj = force->special_lj;
+  }
+  for (int i = 1; i <= n; i++) {
+    for (int j = i; j <= n; j++) {
+      cg_type_double[i][j] = cg_type[i][j];   // mirror the upper triangle into both halves
+      cg_type_double[j][i] = cg_type[i][j];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+// compute: uploads the requested energy/virial accumulators, runs the CUDA pair routine, and downloads them again unless collect_forces_later is set.
+void PairCGCMMCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if(eflag) cuda->cu_eng_vdwl->upload();
+  if(vflag) cuda->cu_virial->upload();
+
+  Cuda_PairCGCMMCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  if(not cuda->shared_data.pair.collect_forces_later)
+  {
+    if(eflag) cuda->cu_eng_vdwl->download();
+    if(vflag) cuda->cu_virial->download();
+  }
+
+}
+
+/* ---------------------------------------------------------------------- */
+// settings: base-class parsing, then the global LJ cutoff is copied into shared_data.pair as F_FLOAT.
+void PairCGCMMCuda::settings(int narg, char **arg)
+{
+  PairCGCMM::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global;
+}
+
+/* ---------------------------------------------------------------------- */
+// coeff: base-class parsing followed by allocate(), which refreshes cg_type_double.
+void PairCGCMMCuda::coeff(int narg, char **arg)
+{
+  PairCGCMM::coeff(narg, arg);
+  allocate();
+}
+// init_style: requests a full, cudable neighbor list (no request when whichflag == 0 with the respa integrator) and clears cut_respa.
+void PairCGCMMCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_style start\n"); )
+  // request regular or rRESPA neighbor lists
+
+  int irequest;
+
+  if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) {
+
+  }
+  else
+  {
+    irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  cut_respa=NULL;
+
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_style end\n"); )
+}
+// init_list: forwards to the base class and, unless CUDA_USE_BINNING is defined, registers list id 0 with the Cuda object.
+void PairCGCMMCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_list\n");)
+  PairCGCMM::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_list end\n");)
+}
+// ev_setup: runs the base ev_setup, then rebuilds the cu_eatom / cu_vatom wrappers when atom->nmax exceeds the size recorded before the call.
+void PairCGCMMCuda::ev_setup(int eflag, int vflag)
+{
+  int maxeatomold=maxeatom, maxvatomold=maxvatom;
+  PairCGCMM::ev_setup(eflag,vflag);
+  // NOTE(review): cCudaData appears without template arguments in this copy; presumably lost in extraction -- confirm against cuda_data.h
+  if (eflag_atom && atom->nmax > maxeatomold)
+  {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+  // per-atom virial: keyed on vflag_atom/maxvatom and bound to atom.vatom (was a copy of the eatom condition and target)
+  if (vflag_atom && atom->nmax > maxvatomold)
+  {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_cg_cmm_cuda.h b/src/USER-CUDA/pair_cg_cmm_cuda.h
new file mode 100644
index 0000000000..74236b889f
--- /dev/null
+++ b/src/USER-CUDA/pair_cg_cmm_cuda.h
@@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// With PAIR_CLASS defined this header contributes only the PairStyle entry that pairs the style name with the class.
+PairStyle(cg/cmm/cuda,PairCGCMMCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_CUDA_H
+#define PAIR_CG_CMM_CUDA_H
+
+#include "pair_cg_cmm.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+// CUDA flavour of PairCGCMM; it derives from it and forwards settings/coeff/init_list/ev_setup to it.
+class PairCGCMMCuda : public PairCGCMM
+{
+ public:
+  PairCGCMMCuda(class LAMMPS *);
+  void compute(int, int);                  // (eflag, vflag): runs Cuda_PairCGCMMCuda
+  void settings(int, char **);             // also copies the global LJ cutoff into shared_data.pair
+  void coeff(int, char **);                // base parsing, then allocate()
+  void init_list(int, class NeighList *);  // registers list id 0 with the Cuda object
+  void init_style();                       // requests a full, cudable neighbor list
+  void ev_setup(int eflag, int vflag);     // rebuilds the cu_eatom / cu_vatom wrappers after growth
+ protected:
+  class Cuda *cuda;                        // set from lmp->cuda in the constructor
+  void allocate();                         // publishes coefficient pointers, fills cg_type_double
+  bool allocated2;                         // true once allocate() has done its one-time setup
+  class CudaNeighList* cuda_neigh_list;    // assigned in init_list()
+  double** cg_type_double;                 // cg_type values stored as double; handed over as pair.coeff5
+  cCudaData* cu_lj1_gm;                    // cu_*_gm: only referenced from a commented-out block in allocate(); the constructor never sets them
+  cCudaData* cu_lj2_gm;                    // NOTE(review): cCudaData has no template arguments in this copy; presumably lost in extraction -- confirm against cuda_data.h
+  cCudaData* cu_lj3_gm;
+  cCudaData* cu_lj4_gm;
+  cCudaData* cu_cg_type_double_gm;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_eam_alloy_cuda.cpp b/src/USER-CUDA/pair_eam_alloy_cuda.cpp
new file mode 100644
index 0000000000..238c7520d9
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_alloy_cuda.cpp
@@ -0,0 +1,326 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
+------------------------------------------------------------------------- */
+
+#include <stdio.h>   // NOTE(review): the three system header names were blank in this copy (angle-bracket text lost); restored per LAMMPS convention -- confirm
+#include <stdlib.h>
+#include <string.h>
+#include "pair_eam_alloy_cuda.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+// Constructor: takes the Cuda handle from lmp, aborts via error->all if it is NULL, and sets one_coeff.
+PairEAMAlloyCuda::PairEAMAlloyCuda(LAMMPS *lmp) : PairEAMCuda(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read DYNAMO setfl file
+------------------------------------------------------------------------- */
+// Expects arg = { "*", "*", setfl file name, one element name or "NULL" per atom type }; any mismatch ends in error->all.
+void PairEAMAlloyCuda::coeff(int narg, char **arg)
+{
+  int i,j;
+
+  if (!allocated) allocate();
+
+  if (narg != 3 + atom->ntypes)
+    error->all("Incorrect args for pair coefficients");
+
+  // insure I,J args are * *
+
+  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
+    error->all("Incorrect args for pair coefficients");
+
+  // read EAM setfl file
+
+  if (setfl) {
+    for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i];
+    delete [] setfl->elements;
+    delete [] setfl->mass;
+    memory->destroy(setfl->frho);
+    memory->destroy(setfl->rhor);
+    memory->destroy(setfl->z2r);
+    delete setfl;
+  }
+  setfl = new Setfl();
+  read_file(arg[2]);
+
+  // read args that map atom types to elements in potential file
+  // map[i] = which element the Ith atom type is, -1 if NULL
+
+  for (i = 3; i < narg; i++) {
+    if (strcmp(arg[i],"NULL") == 0) {
+      map[i-2] = -1;
+      continue;
+    }
+    for (j = 0; j < setfl->nelements; j++)
+      if (strcmp(arg[i],setfl->elements[j]) == 0) break;
+    if (j < setfl->nelements) map[i-2] = j;
+    else error->all("No matching element in EAM potential file");
+  }
+
+  // clear setflag since coeff() called once with I,J = * *
+
+  int n = atom->ntypes;
+  for (i = 1; i <= n; i++)
+    for (j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  // set setflag i,j for type pairs where both are mapped to elements
+  // set mass of atom type if i = j
+
+  int count = 0;
+  for (i = 1; i <= n; i++) {
+    for (j = i; j <= n; j++) {
+      if (map[i] >= 0 && map[j] >= 0) {
+        setflag[i][j] = 1;
+        if (i == j) atom->set_mass(i,setfl->mass[map[i]]);
+        count++;
+      }
+    }
+  }
+
+  if (count == 0) error->all("Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file
+------------------------------------------------------------------------- */
+// Rank 0 (comm->me == 0) does all file I/O; every value it reads is MPI_Bcast to the other ranks and stored in *setfl.
+void PairEAMAlloyCuda::read_file(char *filename)
+{
+  Setfl *file = setfl;
+
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = fopen(filename,"r");
+    if (fptr == NULL) {
+      char str[128];
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);   // bounded: filename can be longer than the buffer
+      error->one(str);
+    }
+  }
+
+  // read and broadcast header
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all("Incorrect element names in EAM potential file");
+
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");   // skip the leading element count; the names follow
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f")) != NULL) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
+  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
+                 "pair:z2r");
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
+    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
+  }
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in setfl potential to standard array format
+------------------------------------------------------------------------- */
+// Rebuilds frho, rhor and z2r from *setfl and fills the type2frho / type2rhor / type2z2r lookup tables from map[].
+void PairEAMAlloyCuda::file2array()
+{
+  int i,j,m,n;
+  int ntypes = atom->ntypes;
+
+  // set function params directly from setfl file
+
+  nrho = setfl->nrho;
+  nr = setfl->nr;
+  drho = setfl->drho;
+  dr = setfl->dr;
+
+  // ------------------------------------------------------------------
+  // setup frho arrays
+  // ------------------------------------------------------------------
+
+  // allocate frho arrays
+  // nfrho = # of setfl elements + 1 for zero array
+
+  nfrho = setfl->nelements + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  // copy each element's frho to global frho
+
+  for (i = 0; i < setfl->nelements; i++)
+    for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m];
+
+  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
+  // this is necessary b/c fp is still computed for non-EAM atoms
+
+  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
+
+  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
+  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
+  // then map it to last frho array of zeroes
+
+  for (i = 1; i <= ntypes; i++)
+    if (map[i] >= 0) type2frho[i] = map[i];
+    else type2frho[i] = nfrho-1;
+
+  // ------------------------------------------------------------------
+  // setup rhor arrays
+  // ------------------------------------------------------------------
+
+  // allocate rhor arrays
+  // nrhor = # of setfl elements
+
+  nrhor = setfl->nelements;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // copy each element's rhor to global rhor
+
+  for (i = 0; i < setfl->nelements; i++)
+    for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m];
+
+  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
+  // for setfl files, I,J mapping only depends on I
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (i = 1; i <= ntypes; i++)
+    for (j = 1; j <= ntypes; j++)
+      type2rhor[i][j] = map[i];
+
+  // ------------------------------------------------------------------
+  // setup z2r arrays
+  // ------------------------------------------------------------------
+
+  // allocate z2r arrays
+  // nz2r = N*(N+1)/2 where N = # of setfl elements
+
+  nz2r = setfl->nelements * (setfl->nelements+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // copy each element pair z2r to global z2r, only for I >= J
+
+  n = 0;
+  for (i = 0; i < setfl->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m];
+      n++;
+    }
+
+  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
+  // set of z2r arrays only fill lower triangular Nelement matrix
+  // value = n = sum over rows of lower-triangular matrix until reach irow,icol
+  // swap indices when irow < icol to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  // type2z2r is not used by non-opt
+  // but set type2z2r to 0 since accessed by opt
+
+  int irow,icol;
+  for (i = 1; i <= ntypes; i++) {
+    for (j = 1; j <= ntypes; j++) {
+      irow = map[i];
+      icol = map[j];
+      if (irow == -1 || icol == -1) {
+        type2z2r[i][j] = 0;
+        continue;
+      }
+      if (irow < icol) {
+        irow = map[j];
+        icol = map[i];
+      }
+      n = 0;
+      for (m = 0; m < irow; m++) n += m + 1;
+      n += icol;
+      type2z2r[i][j] = n;
+    }
+  }
+}
diff --git a/src/USER-CUDA/pair_eam_alloy_cuda.h b/src/USER-CUDA/pair_eam_alloy_cuda.h
new file mode 100644
index 0000000000..d17d9f5c79
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_alloy_cuda.h
@@ -0,0 +1,44 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/alloy/cuda,PairEAMAlloyCuda)
+
+#else
+
+#ifndef LMP_PAIR_EAM_CUDA_ALLOY_H
+#define LMP_PAIR_EAM_CUDA_ALLOY_H
+
+#include "pair_eam_cuda.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as eam/alloy/cuda: PairEAMCuda with its own coeff(), read_file() and file2array(). Use virtual public since this class is parent in multiple inheritance
+
+class PairEAMAlloyCuda : virtual public PairEAMCuda {
+ public:
+  PairEAMAlloyCuda(class LAMMPS *);
+  virtual ~PairEAMAlloyCuda() {}
+  void coeff(int, char **);
+
+ protected:
+  class Cuda *cuda;          // NOTE(review): PairEAMCuda (pair_eam_cuda.h) already declares a protected `cuda`; this member hides it - confirm that is intended
+  void read_file(char *);    // the visible tail of the definition reads on rank 0 and MPI_Bcast's each table
+  void file2array();         // copies the setfl tables into frho/rhor/z2r and fills type2frho/type2rhor/type2z2r
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_eam_cuda.cpp b/src/USER-CUDA/pair_eam_cuda.cpp
new file mode 100644
index 0000000000..0ca7289c6b
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_cuda.cpp
@@ -0,0 +1,239 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. 
Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include
+#include
+#include
+#include
+#include "pair_eam_cuda.h"
+#include "pair_eam_cuda_cu.h"
+#include "pair_virial_compute_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMCuda::PairEAMCuda(LAMMPS *lmp) : PairEAM(lmp)   // caches lmp->cuda, flags the pair force as cudable, nulls all device wrappers
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)   // a /cuda style is unusable without the Cuda object
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;   // set here; not read anywhere in the visible part of this file
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.override_block_per_atom = 0;
+
+  cuda->setSystemParams();
+  cu_rho=NULL;   // wrappers are created lazily in compute() and init_style()
+  cu_fp=NULL;
+  cu_frho_spline = NULL;
+  cu_z2r_spline = NULL;
+  cu_rhor_spline = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   run PairEAM::allocate() once, then publish cutsq and the squared
+   force cutoff (cutforcesq) in the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairEAMCuda::allocate()
+{
+  if(! allocated) PairEAM::allocate();
+  cuda->shared_data.pair.cutsq = cutsq;
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cutforcesq;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::compute(int eflag, int vflag)   // two kernel launches with a forward communication in between
+{
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cutforcesq;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->shared_data.pair.collect_forces_later = 0;
+  if (atom->nmax > nmax) {   // per-atom capacity grew: reallocate host rho/fp and rebuild their device wrappers
+    memory->destroy(rho);
+    memory->destroy(fp);
+    nmax = atom->nmax;
+    memory->create(rho,nmax,"pair:rho");
+    memory->create(fp,nmax,"pair:fp");
+    delete cu_rho;
+    delete cu_fp;
+    cu_rho = new cCudaData (rho, atom->nmax);
+    cu_fp = new cCudaData (fp, atom->nmax);
+    Cuda_PairEAMCuda_Init(&cuda->shared_data,rdr,rdrho,nfrho,nrhor,nr,nrho,nz2r,   // NOTE(review): only reached when nmax grows, yet init_style() recreates the spline wrappers - confirm their device pointers are re-registered after a later init_style()
+    cu_frho_spline->dev_data(),cu_rhor_spline->dev_data(),cu_z2r_spline->dev_data(),   // dereferenced unconditionally: requires init_style() to have run first
+    cu_rho->dev_data(),cu_fp->dev_data(),type2frho,type2z2r,type2rhor);
+  }
+
+
+
+  if(eflag || vflag) ev_setup(eflag,vflag);
+  if(eflag) cuda->cu_eng_vdwl->upload();
+  if(vflag) cuda->cu_virial->upload();
+
+  Cuda_PairEAM1Cuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag,eflag_atom,vflag_atom);
+  comm->forward_comm_pair(this);   // communication step between the two kernels; presumably carries fp via pack_comm/unpack_comm - confirm
+  Cuda_PairEAM2Cuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag,eflag_atom,vflag_atom);
+
+  if(eflag) cuda->cu_eng_vdwl->download();
+  if(vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::settings(int narg, char **arg)   // base-class settings, then refresh the shared cutoff
+{
+  PairEAM::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cutforcesq;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::coeff(int narg, char **arg)   // base-class coeff, then (re)publish cutsq/cut_global via allocate()
+{
+  PairEAM::coeff(narg, arg);
+  allocate();
+}
+
+void PairEAMCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairEAMCuda::init_style start\n"); )
+  // rebuild the tabulated arrays and their splines, then request a full, cudable neighbor list
+  file2array();
+  array2spline();
+  int irequest;
+
+
+  irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  delete cu_rhor_spline;   // drop wrappers left over from an earlier init_style()
+  delete cu_z2r_spline;
+  delete cu_frho_spline;
+
+  cu_rhor_spline = new cCudaData((double*)rhor_spline,nrhor,nr+1,EAM_COEFF_LENGTH);
+  cu_z2r_spline = new cCudaData((double*)z2r_spline,nz2r,nr+1,EAM_COEFF_LENGTH);
+  cu_frho_spline = new cCudaData((double*)frho_spline,nfrho,nrho+1,EAM_COEFF_LENGTH);
+
+  cu_rhor_spline->upload();
+  cu_z2r_spline->upload();
+  cu_frho_spline->upload();
+
+  MYDBG(printf("# CUDA PairEAMCuda::init_style end\n"); )
+}
+
+void PairEAMCuda::init_list(int id, NeighList *ptr)   // base-class bookkeeping plus registration of list 0 with the Cuda object
+{
+  MYDBG(printf("# CUDA PairEAMCuda::init_list\n");)
+  PairEAM::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairEAMCuda::init_list end\n");)
+}
+
+void PairEAMCuda::array2spline()   // recomputes rdr/rdrho and reallocates the three spline tables with 7 coefficients per entry
+{
+  rdr = 1.0/dr;
+  rdrho = 1.0/drho;
+
+  memory->destroy(frho_spline);
+  memory->destroy(rhor_spline);
+  memory->destroy(z2r_spline);
+
+  memory->create(frho_spline,nfrho,nrho+1,7,"pair:frho");
+  memory->create(rhor_spline,nrhor,nr+1,7,"pair:rhor");
+  memory->create(z2r_spline,nz2r,nr+1,7,"pair:z2r");
+
+  for (int i = 0; i < nfrho; i++){
+    interpolate(nrho,drho,frho[i],frho_spline[i]);
+    for(int j=0;jshared_data,n,*iswap,buf);   // NOTE(review): text between '<' and '>' appears to have been lost here (rest of array2spline and the start of pack_comm); restore from the original patch
+  if(sizeof(F_FLOAT)shared_data,n,first,buf,cu_fp->dev_data());   // NOTE(review): same damage; this line merges the end of pack_comm with the body of unpack_comm
+}
+
diff --git a/src/USER-CUDA/pair_eam_cuda.h b/src/USER-CUDA/pair_eam_cuda.h
new file mode 100644
index 0000000000..e560fabd62
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_cuda.h
@@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+ 
http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+#ifdef PAIR_CLASS
+
+PairStyle(eam/cuda,PairEAMCuda)
+
+#else
+
+#ifndef PAIR_EAM_CUDA_H
+#define PAIR_EAM_CUDA_H
+
+#include "cuda_data.h"
+#include "pair_eam.h"
+
+namespace LAMMPS_NS {
+
+class PairEAMCuda : public PairEAM   // pair style eam/cuda: PairEAM whose compute() launches the Cuda_PairEAM*Cuda kernels
+{
+ public:
+  PairEAMCuda(class LAMMPS *);   // NOTE(review): no destructor is declared, so the cCudaData objects created with new in compute()/init_style() are never deleted by this class
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void array2spline();
+  int pack_comm(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+  void unpack_comm(int n, int first, double *buf);
+ protected:
+  class Cuda *cuda;                         // copy of lmp->cuda taken in the constructor
+  void allocate();
+  bool allocated2;                          // initialised to false in the constructor
+  class CudaNeighList* cuda_neigh_list;     // set in init_list() for list id 0 when CUDA_USE_BINNING is not defined
+  cCudaData* cu_rho;                        // wrapper around host array rho, rebuilt in compute() when atom->nmax grows (template arguments lost in extraction)
+  cCudaData* cu_fp;                         // wrapper around host array fp, rebuilt together with cu_rho
+  cCudaData* cu_rhor_spline;                // wrappers around the spline tables, rebuilt and uploaded in init_style()
+  cCudaData* cu_z2r_spline;
+  cCudaData* cu_frho_spline;
+
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_eam_fs_cuda.cpp b/src/USER-CUDA/pair_eam_fs_cuda.cpp
new file mode 100644
index 0000000000..56219d4e31
--- /dev/null
+++ b/src/USER-CUDA/pair_eam_fs_cuda.cpp
@@ -0,0 +1,335 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Tim Lau (MIT) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "pair_eam_fs_cuda.h" +#include "atom.h" +#include "comm.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MAXLINE 1024 + +/* ---------------------------------------------------------------------- */ + +PairEAMFSCuda::PairEAMFSCuda(LAMMPS *lmp) : PairEAMCuda(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + one_coeff = 1; +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs + read EAM Finnis-Sinclair file +------------------------------------------------------------------------- */ + +void PairEAMFSCuda::coeff(int narg, char **arg) +{ + int i,j; + + if (!allocated) allocate(); + + if (narg != 3 + atom->ntypes) + error->all("Incorrect args for pair coefficients"); + + // insure I,J args are * * + + if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0) + error->all("Incorrect args for pair coefficients"); + + // read EAM Finnis-Sinclair file + + if (fs) { + for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i]; + delete [] fs->elements; + delete [] fs->mass; + memory->destroy(fs->frho); + memory->destroy(fs->rhor); + memory->destroy(fs->z2r); + delete fs; + } + fs = new Fs(); + read_file(arg[2]); + + // read args that map atom types to elements in potential file + // map[i] = which element the Ith atom type is, -1 if NULL + + for (i = 3; i < narg; i++) { + if (strcmp(arg[i],"NULL") == 0) { + map[i-2] = -1; + continue; + } + for (j = 0; j < fs->nelements; j++) + if (strcmp(arg[i],fs->elements[j]) == 0) break; + if (j 
< fs->nelements) map[i-2] = j; + else error->all("No matching element in EAM potential file"); + } + + // clear setflag since coeff() called once with I,J = * * + + int n = atom->ntypes; + for (i = 1; i <= n; i++) + for (j = i; j <= n; j++) + setflag[i][j] = 0; + + // set setflag i,j for type pairs where both are mapped to elements + // set mass of atom type if i = j + + int count = 0; + for (i = 1; i <= n; i++) { + for (j = i; j <= n; j++) { + if (map[i] >= 0 && map[j] >= 0) { + setflag[i][j] = 1; + if (i == j) atom->set_mass(i,fs->mass[map[i]]); + count++; + } + } + } + + if (count == 0) error->all("Incorrect args for pair coefficients"); +} + +/* ---------------------------------------------------------------------- + read a multi-element DYNAMO setfl file +------------------------------------------------------------------------- */ + +void PairEAMFSCuda::read_file(char *filename) +{ + Fs *file = fs; + + // open potential file + + int me = comm->me; + FILE *fptr; + char line[MAXLINE]; + + if (me == 0) { + fptr = fopen(filename,"r"); + if (fptr == NULL) { + char str[128]; + sprintf(str,"Cannot open EAM potential file %s",filename); + error->one(str); + } + } + + // read and broadcast header + // extract element names from nelements line + + int n; + if (me == 0) { + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + n = strlen(line) + 1; + } + MPI_Bcast(&n,1,MPI_INT,0,world); + MPI_Bcast(line,n,MPI_CHAR,0,world); + + sscanf(line,"%d",&file->nelements); + int nwords = atom->count_words(line); + if (nwords != file->nelements + 1) + error->all("Incorrect element names in EAM potential file"); + + char **words = new char*[file->nelements+1]; + nwords = 0; + char *first = strtok(line," \t\n\r\f"); + while (words[nwords++] = strtok(NULL," \t\n\r\f")) continue; + + file->elements = new char*[file->nelements]; + for (int i = 0; i < file->nelements; i++) { + n = strlen(words[i]) + 1; + file->elements[i] = new 
char[n]; + strcpy(file->elements[i],words[i]); + } + delete [] words; + + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg %d %lg %lg", + &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut); + } + + MPI_Bcast(&file->nrho,1,MPI_INT,0,world); + MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->nr,1,MPI_INT,0,world); + MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world); + + file->mass = new double[file->nelements]; + memory->create(file->frho,file->nelements,file->nrho+1, + "pair:frho"); + memory->create(file->rhor,file->nelements,file->nelements, + file->nr+1,"pair:rhor"); + memory->create(file->z2r,file->nelements,file->nelements, + file->nr+1,"pair:z2r"); + int i,j,tmp; + for (i = 0; i < file->nelements; i++) { + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg",&tmp,&file->mass[i]); + } + MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world); + + if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]); + MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world); + + for (j = 0; j < file->nelements; j++) { + if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]); + MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world); + } + } + + for (i = 0; i < file->nelements; i++) + for (j = 0; j <= i; j++) { + if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]); + MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world); + } + + // close the potential file + + if (me == 0) fclose(fptr); +} + +/* ---------------------------------------------------------------------- + copy read-in setfl potential to standard array format +------------------------------------------------------------------------- */ + +void PairEAMFSCuda::file2array() +{ + int i,j,m,n; + int ntypes = atom->ntypes; + + // set function params directly from fs file + + nrho = fs->nrho; + nr = fs->nr; + drho = fs->drho; + dr = fs->dr; + + // ------------------------------------------------------------------ + // setup frho arrays + 
// ------------------------------------------------------------------ + + // allocate frho arrays + // nfrho = # of fs elements + 1 for zero array + + nfrho = fs->nelements + 1; + memory->destroy(frho); + memory->create(frho,nfrho,nrho+1,"pair:frho"); + + // copy each element's frho to global frho + + for (i = 0; i < fs->nelements; i++) + for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m]; + + // add extra frho of zeroes for non-EAM types to point to (pair hybrid) + // this is necessary b/c fp is still computed for non-EAM atoms + + for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0; + + // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to + // if atom type doesn't point to element (non-EAM atom in pair hybrid) + // then map it to last frho array of zeroes + + for (i = 1; i <= ntypes; i++) + if (map[i] >= 0) type2frho[i] = map[i]; + else type2frho[i] = nfrho-1; + + // ------------------------------------------------------------------ + // setup rhor arrays + // ------------------------------------------------------------------ + + // allocate rhor arrays + // nrhor = square of # of fs elements + + nrhor = fs->nelements * fs->nelements; + memory->destroy(rhor); + memory->create(rhor,nrhor,nr+1,"pair:rhor"); + + // copy each element pair rhor to global rhor + + n = 0; + for (i = 0; i < fs->nelements; i++) + for (j = 0; j < fs->nelements; j++) { + for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m]; + n++; + } + + // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to + // for fs files, there is a full NxN set of rhor arrays + // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used + + for (i = 1; i <= ntypes; i++) + for (j = 1; j <= ntypes; j++) + type2rhor[i][j] = map[i] * fs->nelements + map[j]; + + // ------------------------------------------------------------------ + // setup z2r arrays + // ------------------------------------------------------------------ + + // allocate z2r arrays + // nz2r = 
N*(N+1)/2 where N = # of fs elements + + nz2r = fs->nelements * (fs->nelements+1) / 2; + memory->destroy(z2r); + memory->create(z2r,nz2r,nr+1,"pair:z2r"); + + // copy each element pair z2r to global z2r, only for I >= J + + n = 0; + for (i = 0; i < fs->nelements; i++) + for (j = 0; j <= i; j++) { + for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m]; + n++; + } + + // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to + // set of z2r arrays only fill lower triangular Nelement matrix + // value = n = sum over rows of lower-triangular matrix until reach irow,icol + // swap indices when irow < icol to stay lower triangular + // if map = -1 (non-EAM atom in pair hybrid): + // type2z2r is not used by non-opt + // but set type2z2r to 0 since accessed by opt + + int irow,icol; + for (i = 1; i <= ntypes; i++) { + for (j = 1; j <= ntypes; j++) { + irow = map[i]; + icol = map[j]; + if (irow == -1 || icol == -1) { + type2z2r[i][j] = 0; + continue; + } + if (irow < icol) { + irow = map[j]; + icol = map[i]; + } + n = 0; + for (m = 0; m < irow; m++) n += m + 1; + n += icol; + type2z2r[i][j] = n; + } + } +} diff --git a/src/USER-CUDA/pair_eam_fs_cuda.h b/src/USER-CUDA/pair_eam_fs_cuda.h new file mode 100644 index 0000000000..c2d4a5504d --- /dev/null +++ b/src/USER-CUDA/pair_eam_fs_cuda.h @@ -0,0 +1,44 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/fs/cuda,PairEAMFSCuda)
+
+#else
+
+#ifndef LMP_PAIR_EAM_FS_CUDA_H
+#define LMP_PAIR_EAM_FS_CUDA_H
+
+#include "pair_eam_cuda.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as eam/fs/cuda: PairEAMCuda reading a Finnis-Sinclair potential file. Use virtual public since this class is parent in multiple inheritance
+
+class PairEAMFSCuda : virtual public PairEAMCuda {
+ public:
+  PairEAMFSCuda(class LAMMPS *);
+  virtual ~PairEAMFSCuda() {}
+  void coeff(int, char **);   // validates "* * file elem..." arguments, reads the file, fills map[] and setflag[][]
+
+ protected:
+  class Cuda *cuda;          // NOTE(review): PairEAMCuda (pair_eam_cuda.h) already declares a protected `cuda`; this member hides it - confirm that is intended
+  void read_file(char *);    // rank 0 reads the file, every table is then MPI_Bcast
+  void file2array();         // copies the read-in tables into frho/rhor/z2r and fills the type2* maps
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_gran_hooke_cuda.cpp b/src/USER-CUDA/pair_gran_hooke_cuda.cpp
new file mode 100644
index 0000000000..2b46f422fa
--- /dev/null
+++ b/src/USER-CUDA/pair_gran_hooke_cuda.cpp
@@ -0,0 +1,247 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_gran_hooke_cuda.h" +#include "pair_gran_hooke_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "modify.h" +#include "fix_pour.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairGranHookeCuda::PairGranHookeCuda(LAMMPS *lmp) : PairGranHooke(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairGranHookeCuda::allocate() +{ + if(! allocated) PairGranHooke::allocate(); + if(! 
allocated2) + { + allocated2 = true; + int n = atom->ntypes; + cuda->shared_data.pair.cutsq = cutsq; + memory->create(cuda->shared_data.pair.coeff1,n+1,n+1, + "pair:cuda_coeff1"); + memory->create(cuda->shared_data.pair.coeff2, + n+1,n+1,"pair:cuda_coeff2"); + cuda->shared_data.pair.coeff1[0][0]=kn; + cuda->shared_data.pair.coeff1[0][1]=kt; + cuda->shared_data.pair.coeff1[1][0]=gamman; + cuda->shared_data.pair.coeff1[1][1]=gammat; + cuda->shared_data.pair.coeff2[0][0]=xmu; + cuda->shared_data.pair.coeff2[0][1]=dampflag; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairGranHookeCuda::compute(int eflag, int vflag) +{ + cuda->shared_data.pair.use_block_per_atom = 0; + //cuda->cu_debugdata->memset_device(0); + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairGranHookeCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(vflag) cuda->cu_virial->download(); + } + //cuda->cu_debugdata->download(); + //printf("%lf %lf %lf %lf %lf %lf\n",1.0e-6*cuda->debugdata[0],1.0e-6*cuda->debugdata[1],1.0e-6*cuda->debugdata[2],1.0e-6*cuda->debugdata[3],1.0e-6*cuda->debugdata[4],1.0e-6*cuda->debugdata[5]); + +} + +/* ---------------------------------------------------------------------- */ + +void PairGranHookeCuda::settings(int narg, char **arg) +{ + PairGranHooke::settings(narg, arg); + } + +/* ---------------------------------------------------------------------- */ + +void PairGranHookeCuda::coeff(int narg, char **arg) +{ + PairGranHooke::coeff(narg, arg); + allocate(); +} + +void PairGranHookeCuda::init_style() +{ + int i; + MYDBG(printf("# CUDA PairGranHookeCuda::init_style start\n"); ) + // request regular or rRESPA neighbor lists + + int irequest; + + if (update->whichflag == 0 && 
strcmp(update->integrate_style,"respa") == 0) { + + } + else + { + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->gran = 1; + neighbor->requests[irequest]->cudable = 1; + //neighbor->style=0; //0=NSQ neighboring + } + + if (!atom->radius_flag || !atom->omega_flag || !atom->torque_flag) + error->all("Pair granular requires atom attributes radius, omega, torque"); + if (comm->ghost_velocity == 0) + error->all("Pair granular requires ghost atoms store velocity"); + + // need a half neigh list and optionally a granular history neigh list + + dt = update->dt; + + + + // check for Fix freeze and set freeze_group_bit + + for (i = 0; i < modify->nfix; i++) + if (strcmp(modify->fix[i]->style,"freeze") == 0) break; + if (i < modify->nfix) freeze_group_bit = modify->fix[i]->groupbit; + else freeze_group_bit = 0; + + cuda->shared_data.pair.freeze_group_bit=freeze_group_bit; + // check for Fix pour and set pour_type and pour_maxdiam + + int pour_type = 0; + double pour_maxrad = 0.0; + for (i = 0; i < modify->nfix; i++) + if (strcmp(modify->fix[i]->style,"pour") == 0) break; + if (i < modify->nfix) { + pour_type = ((FixPour *) modify->fix[i])->ntype; + pour_maxrad = ((FixPour *) modify->fix[i])->radius_hi; + } + + // set maxrad_dynamic and maxrad_frozen for each type + // include future Fix pour particles as dynamic + + for (i = 1; i <= atom->ntypes; i++) + onerad_dynamic[i] = onerad_frozen[i] = 0.0; + if (pour_type) onerad_dynamic[pour_type] = pour_maxrad; + + double *radius = atom->radius; + int *mask = atom->mask; + int *type = atom->type; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++){ + if (mask[i] & freeze_group_bit) + onerad_frozen[type[i]] = MAX(onerad_frozen[type[i]],radius[i]); + else + onerad_dynamic[type[i]] = MAX(onerad_dynamic[type[i]],radius[i]); + } + + MPI_Allreduce(&onerad_dynamic[1],&maxrad_dynamic[1],atom->ntypes, + 
MPI_DOUBLE,MPI_MAX,world); + MPI_Allreduce(&onerad_frozen[1],&maxrad_frozen[1],atom->ntypes, + MPI_DOUBLE,MPI_MAX,world); + + MYDBG(printf("# CUDA PairGranHookeCuda::init_style end\n"); ) +} + +void PairGranHookeCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairGranHookeCuda::init_list\n");) + PairGranHooke::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairGranHookeCuda::init_list end\n");) +} + +void PairGranHookeCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairGranHooke::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_gran_hooke_cuda.h b/src/USER-CUDA/pair_gran_hooke_cuda.h new file mode 100644 index 0000000000..727082f1f8 --- /dev/null +++ b/src/USER-CUDA/pair_gran_hooke_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gran/hooke/cuda,PairGranHookeCuda)
+
+#else
+
+#ifndef PAIR_GRAN_HOOKE_CUDA_H
+#define PAIR_GRAN_HOOKE_CUDA_H
+
+#include "pair_gran_hooke.h"
+
+namespace LAMMPS_NS {
+
+class PairGranHookeCuda : public PairGranHooke   // pair style gran/hooke/cuda: PairGranHooke whose compute() launches Cuda_PairGranHookeCuda
+{
+ public:
+  PairGranHookeCuda(class LAMMPS *);
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);    // also rebuilds cuda->cu_eatom / cuda->cu_vatom when the per-atom arrays grow
+ protected:
+  class Cuda *cuda;                       // copy of lmp->cuda taken in the constructor
+  void allocate();
+  bool allocated2;                        // guards the one-time setup of the shared coeff1/coeff2 arrays in allocate()
+  class CudaNeighList* cuda_neigh_list;   // set in init_list() for list id 0 when CUDA_USE_BINNING is not defined
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pair_lj96_cut_cuda.cpp b/src/USER-CUDA/pair_lj96_cut_cuda.cpp
new file mode 100644
index 0000000000..10e43d3278
--- /dev/null
+++ b/src/USER-CUDA/pair_lj96_cut_cuda.cpp
@@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj96_cut_cuda.h" +#include "pair_lj96_cut_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +/* ---------------------------------------------------------------------- + constructor: take the Cuda object from the LAMMPS instance and report an error through error->all() when it is NULL, then flag the pair force as cudable and call cuda->setSystemParams() +------------------------------------------------------------------------- */ + +PairLJ96CutCuda::PairLJ96CutCuda(LAMMPS *lmp) : PairLJ96Cut(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data + the base-class allocate() runs first when 'allocated' is not set; the pointer hand-off itself happens only once, guarded by allocated2 +------------------------------------------------------------------------- */ + +void PairLJ96CutCuda::allocate() +{ + if(! allocated) PairLJ96Cut::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- + compute: call ev_setup() when either flag is set, upload the requested energy / virial buffers, run Cuda_PairLJ96CutCuda() and download the same buffers again unless shared_data.pair.collect_forces_later is set +------------------------------------------------------------------------- */ + +void PairLJ96CutCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJ96CutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(vflag) cuda->cu_virial->download(); + } + +} + +/* ---------------------------------------------------------------------- + settings: the base class handles the arguments, afterwards cut_global is copied into the cuda shared data as F_FLOAT +------------------------------------------------------------------------- */ + +void PairLJ96CutCuda::settings(int narg, char **arg) +{ + PairLJ96Cut::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global; +} + +/* ---------------------------------------------------------------------- */ + +void
PairLJ96CutCuda::coeff(int narg, char **arg)
+{
+  // base class stores the coefficients; allocate() then hands the arrays to the cuda shared data
+  PairLJ96Cut::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   request a full, cudable neighbor list
+   no request is made when update->whichflag is 0 and the integrate style
+   is "respa" (that branch is intentionally left empty here)
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_style start\n"); )
+  // request regular or rRESPA neighbor lists
+
+  int irequest;
+
+  if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) {
+
+  }
+  else
+  {
+    irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+
+  cut_respa = NULL;
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   pass the list on to the base class and, unless CUDA_USE_BINNING is
+   defined, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_list\n");)
+  PairLJ96Cut::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup(), then rebuild the cCudaData wrappers of
+   the per-atom energy / virial arrays whenever atom->nmax exceeds the
+   size recorded before the call (presumably the base class re-allocates
+   eatom / vatom in that case - confirm against Pair::ev_setup)
+   the virial wrapper is keyed on vflag_atom / maxvatom and bound to the
+   vatom slot of the shared data, not to the energy ones
+   NOTE(review): "new cCudaData (...)" carries no template arguments in
+   this text; they look lost in extraction - confirm against the source
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes as they were before the base class had a chance to grow the arrays
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairLJ96Cut::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+
+  if (vflag_atom && atom->nmax > maxvatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_lj96_cut_cuda.h b/src/USER-CUDA/pair_lj96_cut_cuda.h
new file mode 100644
index 0000000000..0abb66f6aa
--- /dev/null
+++ b/src/USER-CUDA/pair_lj96_cut_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National
Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj96/cut/cuda,PairLJ96CutCuda) + +#else + +#ifndef PAIR_LJ96_CUT_CUDA_H +#define PAIR_LJ96_CUT_CUDA_H + +#include "pair_lj96_cut.h" + +namespace LAMMPS_NS { + +/* ---------------------------------------------------------------------- + pair style lj96/cut/cuda (see the PairStyle line above): derived from PairLJ96Cut, redeclares the methods listed below and keeps a pointer to the Cuda object plus a pointer to a CudaNeighList + allocated2 is cleared in the constructor and set by allocate() once the coefficient arrays have been handed to the cuda shared data +------------------------------------------------------------------------- */ +class PairLJ96CutCuda : public PairLJ96Cut +{ + public: + PairLJ96CutCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp new file mode 100644 index 0000000000..8e74daf70f --- /dev/null +++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp @@ -0,0 +1,193 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory.
+ + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_charmm_coul_charmm_cuda.h" +#include "pair_lj_charmm_coul_charmm_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- + constructor: take the Cuda object from the LAMMPS instance and report an error through error->all() when it is NULL, then flag the pair force as cudable, clear use_block_per_atom and call cuda->setSystemParams() +------------------------------------------------------------------------- */ + +PairLJCharmmCoulCharmmCuda::PairLJCharmmCoulCharmmCuda(LAMMPS *lmp) : PairLJCharmmCoulCharmm(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->shared_data.pair.use_block_per_atom = 0; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data + the base-class allocate() runs first when 'allocated' is not set; the hand-off happens only once, guarded by allocated2, and also creates one cCudaData object per lj1..lj4 table sized (ntypes+1)*(ntypes+1) + NOTE(review): no matching delete for cu_lj1_gm..cu_lj4_gm is visible in this file - confirm where they are released +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmCuda::allocate() +{ + if(! allocated) PairLJCharmmCoulCharmm::allocate(); + if(!
allocated2) + { + allocated2 = true; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + cu_lj1_gm = new cCudaData ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj2_gm = new cCudaData ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj3_gm = new cCudaData ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj4_gm = new cCudaData ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1)); + } +} + +/* ---------------------------------------------------------------------- + compute: call ev_setup() when either flag is set and run Cuda_PairLJCharmmCoulCharmmCuda(); the vdwl / coul energy and virial buffers are uploaded before and downloaded after the call only when shared_data.pair.collect_forces_later is not set +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + } + + Cuda_PairLJCharmmCoulCharmmCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom,denom_lj,cut_coul_innersq,denom_coul); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- + settings: the base class handles the arguments, afterwards cut_lj, cut_coulsq and cut_lj_inner are copied into the cuda shared data + NOTE(review): cut_coulsq is (re)computed in init_style(), which also refreshes cut_coulsq_global - the value copied here may be preliminary +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmCuda::settings(int narg, char **arg) +{ + PairLJCharmmCoulCharmm::settings(narg, arg); + cuda->shared_data.pair.cut_global = (X_FLOAT) cut_lj; + cuda->shared_data.pair.cut_coulsq_global = (X_FLOAT) cut_coulsq; + cuda->shared_data.pair.cut_inner_global = (F_FLOAT) cut_lj_inner; +} + +/* ---------------------------------------------------------------------- */ + +void
PairLJCharmmCoulCharmmCuda::coeff(int narg, char **arg)
+{
+  // base class stores the coefficients; allocate() then hands the arrays to the cuda shared data
+  PairLJCharmmCoulCharmm::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   check for per-atom charge, request a full cudable neighbor list,
+   validate the inner/outer cutoffs and derive the squared cutoffs and
+   switching denominators; cut_coulsq and qqrd2e are mirrored into the
+   cuda shared data
+   collect_forces_later is switched on when atom->molecular is set
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style lj/charmm/coul/charmm/cuda requires atom attribute q");
+  // request regular or rRESPA neighbor lists
+
+  if(atom->molecular)
+  {
+    cuda->shared_data.pair.collect_forces_later = 1;
+  }
+
+  int irequest;
+
+  irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
+    (cut_coulsq-cut_coul_innersq);
+
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+}
+
+/* ----------------------------------------------------------------------
+   pass the list on to the base class and, unless CUDA_USE_BINNING is
+   defined, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmCuda::init_list\n");)
+  PairLJCharmmCoulCharmm::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup(), then rebuild the cCudaData wrappers of
+   the per-atom energy / virial arrays whenever atom->nmax exceeds the
+   size recorded before the call (presumably the base class re-allocates
+   eatom / vatom in that case - confirm against Pair::ev_setup)
+   the virial wrapper is keyed on vflag_atom / maxvatom and bound to the
+   vatom slot of the shared data, not to the energy ones
+   NOTE(review): "new cCudaData (...)" carries no template arguments in
+   this text; they look lost in extraction - confirm against the source
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes as they were before the base class had a chance to grow the arrays
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairLJCharmmCoulCharmm::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+
+  if (vflag_atom && atom->nmax > maxvatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
new file mode 100644
index 0000000000..39ec4735ef
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/charmm/coul/charmm/cuda,PairLJCharmmCoulCharmmCuda) + +#else + +#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_CUDA_H +#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_CUDA_H + +#include "pair_lj_charmm_coul_charmm.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +/* ---------------------------------------------------------------------- + pair style lj/charmm/coul/charmm/cuda (see the PairStyle line above): derived from PairLJCharmmCoulCharmm, redeclares the methods listed below and keeps a pointer to the Cuda object, a pointer to a CudaNeighList and four cCudaData pointers for the lj1..lj4 tables + NOTE(review): the cu_lj*_gm objects are created with new in allocate() and no destructor is declared here - confirm who releases them +------------------------------------------------------------------------- */ +class PairLJCharmmCoulCharmmCuda : public PairLJCharmmCoulCharmm +{ + public: + PairLJCharmmCoulCharmmCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; + cCudaData* cu_lj1_gm; + cCudaData* cu_lj2_gm; + cCudaData* cu_lj3_gm; + cCudaData* cu_lj4_gm; + +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp new file mode 100644 index 0000000000..9a4bed09eb --- /dev/null +++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp @@ -0,0 +1,188 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory.
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_charmm_coul_charmm_implicit_cuda.h" +#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- + constructor: take the Cuda object from the LAMMPS instance and report an error through error->all() when it is NULL, then flag the pair force as cudable, switch collect_forces_later on and call cuda->setSystemParams() +------------------------------------------------------------------------- */ + +PairLJCharmmCoulCharmmImplicitCuda::PairLJCharmmCoulCharmmImplicitCuda(LAMMPS *lmp) : PairLJCharmmCoulCharmmImplicit(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->shared_data.pair.collect_forces_later = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data + the base-class allocate() runs first when 'allocated' is not set; the hand-off happens only once, guarded by allocated2, and also creates one cCudaData object per lj1..lj4 table sized (ntypes+1)*(ntypes+1) + NOTE(review): no matching delete for cu_lj1_gm..cu_lj4_gm is visible in this file - confirm where they are released +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmImplicitCuda::allocate() +{ + if(! allocated) PairLJCharmmCoulCharmmImplicit::allocate(); + if(!
allocated2) + { + allocated2 = true; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + cu_lj1_gm = new cCudaData ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj2_gm = new cCudaData ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj3_gm = new cCudaData ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj4_gm = new cCudaData ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1)); + } +} + +/* ---------------------------------------------------------------------- + compute: call ev_setup() when either flag is set and run Cuda_PairLJCharmmCoulCharmmImplicitCuda(); the vdwl / coul energy and virial buffers are uploaded before and downloaded after the call only when shared_data.pair.collect_forces_later is not set +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmImplicitCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + } + + Cuda_PairLJCharmmCoulCharmmImplicitCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom,denom_lj,cut_coul_innersq,denom_coul); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- + settings: the base class handles the arguments, afterwards cut_lj, cut_coulsq and cut_lj_inner are copied into the cuda shared data + NOTE(review): cut_coulsq is (re)computed in init_style(), which also refreshes cut_coulsq_global - the value copied here may be preliminary +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmImplicitCuda::settings(int narg, char **arg) +{ + PairLJCharmmCoulCharmmImplicit::settings(narg, arg); + cuda->shared_data.pair.cut_global = (X_FLOAT) cut_lj; + cuda->shared_data.pair.cut_coulsq_global = (X_FLOAT) cut_coulsq; + cuda->shared_data.pair.cut_inner_global = (F_FLOAT) cut_lj_inner; +} + +/*
---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::coeff(int narg, char **arg)
+{
+  // base class stores the coefficients; allocate() then hands the arrays to the cuda shared data
+  PairLJCharmmCoulCharmmImplicit::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   check for per-atom charge, request a full cudable neighbor list,
+   validate the inner/outer cutoffs and derive the squared cutoffs and
+   switching denominators; cut_coulsq and qqrd2e are mirrored into the
+   cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style lj/charmm/coul/charmm/implicit/cuda requires atom attribute q");
+  // request regular or rRESPA neighbor lists
+
+  int irequest;
+
+  irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
+    (cut_coulsq-cut_coul_innersq);
+
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+}
+
+/* ----------------------------------------------------------------------
+   pass the list on to the base class and, unless CUDA_USE_BINNING is
+   defined, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmImplicitCuda::init_list\n");)
+  PairLJCharmmCoulCharmmImplicit::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmImplicitCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup(), then rebuild the cCudaData wrappers of
+   the per-atom energy / virial arrays whenever atom->nmax exceeds the
+   size recorded before the call (presumably the base class re-allocates
+   eatom / vatom in that case - confirm against Pair::ev_setup)
+   the virial wrapper is keyed on vflag_atom / maxvatom and bound to the
+   vatom slot of the shared data, not to the energy ones
+   NOTE(review): "new cCudaData (...)" carries no template arguments in
+   this text; they look lost in extraction - confirm against the source
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes as they were before the base class had a chance to grow the arrays
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairLJCharmmCoulCharmmImplicit::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+
+  if (vflag_atom && atom->nmax > maxvatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
new file mode 100644
index 0000000000..94d8d09543
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/charmm/coul/charmm/implicit/cuda,PairLJCharmmCoulCharmmImplicitCuda) + +#else + +#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_CUDA_H +#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_CUDA_H + +#include "pair_lj_charmm_coul_charmm_implicit.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +/* ---------------------------------------------------------------------- + pair style lj/charmm/coul/charmm/implicit/cuda (see the PairStyle line above): derived from PairLJCharmmCoulCharmmImplicit, redeclares the methods listed below and keeps a pointer to the Cuda object, a pointer to a CudaNeighList and four cCudaData pointers for the lj1..lj4 tables + NOTE(review): the cu_lj*_gm objects are created with new in allocate() and no destructor is declared here - confirm who releases them +------------------------------------------------------------------------- */ +class PairLJCharmmCoulCharmmImplicitCuda : public PairLJCharmmCoulCharmmImplicit +{ + public: + PairLJCharmmCoulCharmmImplicitCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; + cCudaData* cu_lj1_gm; + cCudaData* cu_lj2_gm; + cCudaData* cu_lj3_gm; + cCudaData* cu_lj4_gm; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp new file mode 100644 index 0000000000..4ba45efd54 --- /dev/null +++ b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp @@ -0,0 +1,201 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory.
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_charmm_coul_long_cuda.h" +#include "pair_lj_charmm_coul_long_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 +/* ---------------------------------------------------------------------- + constructor: take the Cuda object from the LAMMPS instance and report an error through error->all() when it is NULL, then flag the pair force as cudable, switch collect_forces_later on and call cuda->setSystemParams() +------------------------------------------------------------------------- */ + +PairLJCharmmCoulLongCuda::PairLJCharmmCoulLongCuda(LAMMPS *lmp) : PairLJCharmmCoulLong(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->shared_data.pair.collect_forces_later = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data + the base-class allocate() runs first when 'allocated' is not set; the hand-off happens only once, guarded by allocated2, and also creates one cCudaData object per lj1..lj4 table sized (ntypes+1)*(ntypes+1) + NOTE(review): no matching delete for cu_lj1_gm..cu_lj4_gm is visible in this file - confirm where they are released +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulLongCuda::allocate() +{ + if(! allocated) PairLJCharmmCoulLong::allocate(); + if(!
allocated2) + { + allocated2 = true; + //cuda->shared_data.pair.cut = cut_lj; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + cu_lj1_gm = new cCudaData ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj2_gm = new cCudaData ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj3_gm = new cCudaData ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj4_gm = new cCudaData ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1)); + } +} + +/* ---------------------------------------------------------------------- + compute: call ev_setup() when either flag is set and run Cuda_PairLJCharmmCoulLongCuda(); the vdwl / coul energy and virial buffers are uploaded before and downloaded after the call only when shared_data.pair.collect_forces_later is not set +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulLongCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + } + + Cuda_PairLJCharmmCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom,denom_lj); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- + settings: the base class handles the arguments, afterwards cut_lj, cut_coulsq and cut_lj_inner are copied into the cuda shared data + NOTE(review): cut_coulsq is (re)computed in init_style(), which also refreshes cut_coulsq_global - the value copied here may be preliminary +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulLongCuda::settings(int narg, char **arg) +{ + PairLJCharmmCoulLong::settings(narg, arg); + cuda->shared_data.pair.cut_global = (X_FLOAT) cut_lj; + cuda->shared_data.pair.cut_coulsq_global = (X_FLOAT) cut_coulsq; + cuda->shared_data.pair.cut_inner_global = (F_FLOAT) cut_lj_inner; +} + +/*
---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::coeff(int narg, char **arg)
+{
+  // base class stores the coefficients; allocate() then hands the arrays to the cuda shared data
+  PairLJCharmmCoulLong::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   check for per-atom charge, request a full cudable neighbor list,
+   validate the LJ inner/outer cutoff and derive the squared cutoffs and
+   the LJ switching denominator; cut_coulsq, g_ewald (taken from the
+   KSpace style, which must exist) and qqrd2e are mirrored into the cuda
+   shared data
+   a warning is issued when ncoultablebits is non-zero
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style lj/charmm/coul/long requires atom attribute q");
+  // request regular or rRESPA neighbor lists
+
+  int irequest;
+
+
+  irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj)
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+/* ----------------------------------------------------------------------
+   pass the list on to the base class and, unless CUDA_USE_BINNING is
+   defined, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulLongCuda::init_list\n");)
+  PairLJCharmmCoulLong::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairLJCharmmCoulLongCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup(), then rebuild the cCudaData wrappers of
+   the per-atom energy / virial arrays whenever atom->nmax exceeds the
+   size recorded before the call (presumably the base class re-allocates
+   eatom / vatom in that case - confirm against Pair::ev_setup)
+   the virial wrapper is keyed on vflag_atom / maxvatom and bound to the
+   vatom slot of the shared data, not to the energy ones
+   NOTE(review): "new cCudaData (...)" carries no template arguments in
+   this text; they look lost in extraction - confirm against the source
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes as they were before the base class had a chance to grow the arrays
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairLJCharmmCoulLong::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+    {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+
+  if (vflag_atom && atom->nmax > maxvatomold)
+    {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
new file mode 100644
index 0000000000..4548883aaa
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/charmm/coul/long/cuda,PairLJCharmmCoulLongCuda) + +#else + +#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_CUDA_H +#define LMP_PAIR_LJ_CHARMM_COUL_LONG_CUDA_H + +#include "pair_lj_charmm_coul_long.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class PairLJCharmmCoulLongCuda : public PairLJCharmmCoulLong +{ + public: + PairLJCharmmCoulLongCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; + cCudaData* cu_lj1_gm; + cCudaData* cu_lj2_gm; + cCudaData* cu_lj3_gm; + cCudaData* cu_lj4_gm; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp new file mode 100644 index 0000000000..6ef1e7116a --- /dev/null +++ b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp @@ -0,0 +1,167 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. 
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_class2_coul_cut_cuda.h" +#include "pair_lj_class2_coul_cut_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJClass2CoulCutCuda::PairLJClass2CoulCutCuda(LAMMPS *lmp) : PairLJClass2CoulCut(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJClass2CoulCutCuda::allocate() +{ + if(! allocated) PairLJClass2CoulCut::allocate(); + if(! 
allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut_lj; + cuda->shared_data.pair.cut_coul= cut_coul; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJClass2CoulCutCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJClass2CoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJClass2CoulCutCuda::settings(int narg, char **arg) +{ + PairLJClass2CoulCut::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global; + cuda->shared_data.pair.cut_coul_global = (F_FLOAT) cut_coul_global; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJClass2CoulCutCuda::coeff(int narg, char **arg) +{ + PairLJClass2CoulCut::coeff(narg, arg); + allocate(); +} + +void PairLJClass2CoulCutCuda::init_style() +{ + if (!atom->q_flag) + error->all("Pair style lj/cut/coul/cut/cuda requires atom attribute q"); + // request regular or rRESPA neighbor lists + + int irequest; + + + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + + + 
cuda->shared_data.pppm.qqrd2e=force->qqrd2e; + +} + +void PairLJClass2CoulCutCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJClass2CoulCutCuda::init_list\n");) + PairLJClass2CoulCut::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairLJClass2CoulCutCuda::init_list end\n");) +} + +void PairLJClass2CoulCutCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairLJClass2CoulCut::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h new file mode 100644 index 0000000000..e9edf9839b --- /dev/null +++ b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. 
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/class2/coul/cut/cuda,PairLJClass2CoulCutCuda) + +#else + +#ifndef LMP_PAIR_LJ_CLASS2_COUL_CUT_CUDA_H +#define LMP_PAIR_LJ_CLASS2_COUL_CUT_CUDA_H + +#include "pair_lj_class2_coul_cut.h" + +namespace LAMMPS_NS { + +class PairLJClass2CoulCutCuda : public PairLJClass2CoulCut +{ + public: + PairLJClass2CoulCutCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp new file mode 100644 index 0000000000..6cf036e300 --- /dev/null +++ b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp @@ -0,0 +1,180 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_class2_coul_long_cuda.h" +#include "pair_lj_class2_coul_long_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 +/* ---------------------------------------------------------------------- */ + +PairLJClass2CoulLongCuda::PairLJClass2CoulLongCuda(LAMMPS *lmp) : PairLJClass2CoulLong(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJClass2CoulLongCuda::allocate() +{ + if(! allocated) PairLJClass2CoulLong::allocate(); + if(! 
allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut_lj; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJClass2CoulLongCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJClass2CoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJClass2CoulLongCuda::settings(int narg, char **arg) +{ + PairLJClass2CoulLong::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJClass2CoulLongCuda::coeff(int narg, char **arg) +{ + PairLJClass2CoulLong::coeff(narg, arg); + allocate(); +} + +void PairLJClass2CoulLongCuda::init_style() +{ + if (!atom->q_flag) + error->all("Pair style lj/cut/coul/long requires atom attribute q"); + // request regular or rRESPA neighbor lists + + int irequest; + + + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + + cut_coulsq = cut_coul * cut_coul; + cuda->shared_data.pair.cut_coul_global=cut_coul; + 
cuda->shared_data.pair.cut_coulsq_global=cut_coulsq; + // set rRESPA cutoffs + + if (force->newton) error->warning("Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster."); + if (force->kspace == NULL) + error->all("Pair style is incompatible with KSpace style"); + g_ewald = force->kspace->g_ewald; + cuda->shared_data.pair.g_ewald=g_ewald; + cuda->shared_data.pppm.qqrd2e=force->qqrd2e; + + + if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n"); +} + +void PairLJClass2CoulLongCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJClass2CoulLongCuda::init_list\n");) + PairLJClass2CoulLong::init_list(id, ptr); + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + MYDBG(printf("# CUDA PairLJClass2CoulLongCuda::init_list end\n");) +} + +void PairLJClass2CoulLongCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairLJClass2CoulLong::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h new file mode 100644 index 0000000000..6bf4a71e16 --- /dev/null +++ b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the 
README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/class2/coul/long/cuda,PairLJClass2CoulLongCuda) + +#else + +#ifndef LMP_PAIR_LJ_CLASS2_COUL_LONG_CUDA_H +#define LMP_PAIR_LJ_CLASS2_COUL_LONG_CUDA_H + +#include "pair_lj_class2_coul_long.h" + +namespace LAMMPS_NS { + +class PairLJClass2CoulLongCuda : public PairLJClass2CoulLong +{ + public: + PairLJClass2CoulLongCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_class2_cuda.cpp b/src/USER-CUDA/pair_lj_class2_cuda.cpp new file mode 100644 index 0000000000..0d253c940a --- /dev/null +++ b/src/USER-CUDA/pair_lj_class2_cuda.cpp @@ -0,0 +1,172 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_class2_cuda.h" +#include "pair_lj_class2_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJClass2Cuda::PairLJClass2Cuda(LAMMPS *lmp) : PairLJClass2(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJClass2Cuda::allocate() +{ + if(! allocated) PairLJClass2::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJClass2Cuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJClass2Cuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(vflag) cuda->cu_virial->download(); + } + +} + +/* ---------------------------------------------------------------------- */ + +void PairLJClass2Cuda::settings(int narg, char **arg) +{ + PairLJClass2::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global; +} + +/* ---------------------------------------------------------------------- */ + +void 
PairLJClass2Cuda::coeff(int narg, char **arg) +{ + PairLJClass2::coeff(narg, arg); + allocate(); +} + +void PairLJClass2Cuda::init_style() +{ + MYDBG(printf("# CUDA PairLJClass2Cuda::init_style start\n"); ) + // request regular or rRESPA neighbor lists + + int irequest; + + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + //neighbor->style=0; //0=NSQ neighboring + MYDBG(printf("# CUDA PairLJClass2Cuda::init_style end\n"); ) +} + +void PairLJClass2Cuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJClass2Cuda::init_list\n");) + PairLJClass2::init_list(id, ptr); + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + MYDBG(printf("# CUDA PairLJClass2Cuda::init_list end\n");) +} + +void PairLJClass2Cuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairLJClass2::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_lj_class2_cuda.h b/src/USER-CUDA/pair_lj_class2_cuda.h new file mode 100644 index 0000000000..8643ad94aa --- /dev/null +++ b/src/USER-CUDA/pair_lj_class2_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/class2/cuda,PairLJClass2Cuda) + +#else + +#ifndef PAIR_LJ_CLASS2_CUDA_H +#define PAIR_LJ_CLASS2_CUDA_H + +#include "pair_lj_class2.h" + +namespace LAMMPS_NS { + +class PairLJClass2Cuda : public PairLJClass2 +{ + public: + PairLJClass2Cuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp new file mode 100644 index 0000000000..399d8c6758 --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp @@ -0,0 +1,167 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_cut_coul_cut_cuda.h" +#include "pair_lj_cut_coul_cut_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulCutCuda::PairLJCutCoulCutCuda(LAMMPS *lmp) : PairLJCutCoulCut(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJCutCoulCutCuda::allocate() +{ + if(! allocated) PairLJCutCoulCut::allocate(); + if(! 
allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut_lj; + cuda->shared_data.pair.cut_coul= cut_coul; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJCutCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutCuda::settings(int narg, char **arg) +{ + PairLJCutCoulCut::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global; + cuda->shared_data.pair.cut_coul_global = (F_FLOAT) cut_coul_global; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutCuda::coeff(int narg, char **arg) +{ + PairLJCutCoulCut::coeff(narg, arg); + allocate(); +} + +void PairLJCutCoulCutCuda::init_style() +{ + if (!atom->q_flag) + error->all("Pair style lj/cut/coul/cut/cuda requires atom attribute q"); + // request regular or rRESPA neighbor lists + + int irequest; + + + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + + + 
cuda->shared_data.pppm.qqrd2e=force->qqrd2e; + +} + +void PairLJCutCoulCutCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJCutCoulCutCuda::init_list\n");) + PairLJCutCoulCut::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairLJCutCoulCutCuda::init_list end\n");) +} + +void PairLJCutCoulCutCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairLJCutCoulCut::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h new file mode 100644 index 0000000000..130140d6ce --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. 
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/cut/coul/cut/cuda,PairLJCutCoulCutCuda) + +#else + +#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_CUDA_H +#define LMP_PAIR_LJ_CUT_COUL_CUT_CUDA_H + +#include "pair_lj_cut_coul_cut.h" + +namespace LAMMPS_NS { + +class PairLJCutCoulCutCuda : public PairLJCutCoulCut +{ + public: + PairLJCutCoulCutCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp new file mode 100644 index 0000000000..dd745ca7da --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp @@ -0,0 +1,168 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_cut_coul_debye_cuda.h" +#include "pair_lj_cut_coul_debye_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulDebyeCuda::PairLJCutCoulDebyeCuda(LAMMPS *lmp) : PairLJCutCoulDebye(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJCutCoulDebyeCuda::allocate() +{ + if(! allocated) PairLJCutCoulDebye::allocate(); + if(! 
allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut_lj; + cuda->shared_data.pair.cut_coul= cut_coul; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulDebyeCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJCutCoulDebyeCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulDebyeCuda::settings(int narg, char **arg) +{ + PairLJCutCoulDebye::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global; + cuda->shared_data.pair.cut_coul_global = (F_FLOAT) cut_coul_global; + cuda->shared_data.pair.kappa = (F_FLOAT) kappa; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulDebyeCuda::coeff(int narg, char **arg) +{ + PairLJCutCoulDebye::coeff(narg, arg); + allocate(); +} + +void PairLJCutCoulDebyeCuda::init_style() +{ + if (!atom->q_flag) + error->all("Pair style lj/cut/coul/debye/cuda requires atom attribute q"); + // request regular or rRESPA neighbor lists + + int irequest; + + + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + 
neighbor->requests[irequest]->cudable = 1; + + + cuda->shared_data.pppm.qqrd2e=force->qqrd2e; + +} + +void PairLJCutCoulDebyeCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJCutCoulDebyeCuda::init_list\n");) + PairLJCutCoulDebye::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairLJCutCoulDebyeCuda::init_list end\n");) +} + +void PairLJCutCoulDebyeCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairLJCutCoulDebye::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h new file mode 100644 index 0000000000..853c428143 --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/cut/coul/debye/cuda,PairLJCutCoulDebyeCuda) + +#else + +#ifndef LMP_PAIR_LJ_CUT_COUL_DEBYE_CUDA_H +#define LMP_PAIR_LJ_CUT_COUL_DEBYE_CUDA_H + +#include "pair_lj_cut_coul_debye.h" + +namespace LAMMPS_NS { + +class PairLJCutCoulDebyeCuda : public PairLJCutCoulDebye +{ + public: + PairLJCutCoulDebyeCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp new file mode 100644 index 0000000000..53e65182a5 --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp @@ -0,0 +1,221 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_cut_coul_long_cuda.h" +#include "pair_lj_cut_coul_long_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulLongCuda::PairLJCutCoulLongCuda(LAMMPS *lmp) : PairLJCutCoulLong(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJCutCoulLongCuda::allocate() +{ + if(! allocated) PairLJCutCoulLong::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut_lj; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJCutCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongCuda::settings(int narg, char **arg) +{ + PairLJCutCoulLong::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_lj_global; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongCuda::coeff(int narg, char **arg) +{ + PairLJCutCoulLong::coeff(narg, arg); + allocate(); +} + +void PairLJCutCoulLongCuda::init_style() +{ + if (!atom->q_flag) 
+ error->all("Pair style lj/cut/coul/long requires atom attribute q"); + // request regular or rRESPA neighbor lists + + int irequest; + + if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) { + int respa = 0; + if (((Respa *) update->integrate)->level_inner >= 0) respa = 1; + if (((Respa *) update->integrate)->level_middle >= 0) respa = 2; + + if (respa == 0) irequest = neighbor->request(this); + else if (respa == 1) { + irequest = neighbor->request(this); + neighbor->requests[irequest]->id = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->respainner = 1; + irequest = neighbor->request(this); + neighbor->requests[irequest]->id = 3; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->respaouter = 1; + } else { + irequest = neighbor->request(this); + neighbor->requests[irequest]->id = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->respainner = 1; + irequest = neighbor->request(this); + neighbor->requests[irequest]->id = 2; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->respamiddle = 1; + irequest = neighbor->request(this); + neighbor->requests[irequest]->id = 3; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->respaouter = 1; + } + + } + else + { + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + } + + cut_coulsq = cut_coul * cut_coul; + cuda->shared_data.pair.cut_coul_global=cut_coul; + cuda->shared_data.pair.cut_coulsq_global=cut_coulsq; + // set rRESPA cutoffs + + if (strcmp(update->integrate_style,"respa") == 0 && + ((Respa *) update->integrate)->level_inner >= 0) + cut_respa = ((Respa *) update->integrate)->cutoff; + else cut_respa = NULL; + + if (force->newton) error->warning("Pair style uses does not use \"newton\" setting. 
You might test if \"newton off\" makes the simulation run faster."); + if (force->kspace == NULL) + error->all("Pair style is incompatible with KSpace style"); + g_ewald = force->kspace->g_ewald; + cuda->shared_data.pair.g_ewald=g_ewald; + cuda->shared_data.pppm.qqrd2e=force->qqrd2e; + + + if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n"); +} + +void PairLJCutCoulLongCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJCutCoulLongCuda::init_list\n");) + PairLJCutCoulLong::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairLJCutCoulLongCuda::init_list end\n");) +} + +void PairLJCutCoulLongCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairLJCutCoulLong::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h new file mode 100644 index 0000000000..2f14357408 --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/cut/coul/long/cuda,PairLJCutCoulLongCuda) + +#else + +#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_CUDA_H +#define LMP_PAIR_LJ_CUT_COUL_LONG_CUDA_H + +#include "pair_lj_cut_coul_long.h" + +namespace LAMMPS_NS { + +class PairLJCutCoulLongCuda : public PairLJCutCoulLong +{ + public: + PairLJCutCoulLongCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_cut_cuda.cpp b/src/USER-CUDA/pair_lj_cut_cuda.cpp new file mode 100644 index 0000000000..d63134c3c3 --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_cuda.cpp @@ -0,0 +1,184 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_cut_cuda.h" +#include "pair_lj_cut_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJCutCuda::PairLJCutCuda(LAMMPS *lmp) : PairLJCut(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJCutCuda::allocate() +{ + if(! allocated) PairLJCut::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(vflag) cuda->cu_virial->download(); + } + +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCuda::settings(int narg, char **arg) +{ + PairLJCut::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCuda::coeff(int narg, char 
**arg) +{ + PairLJCut::coeff(narg, arg); + allocate(); +} + +void PairLJCutCuda::init_style() +{ + MYDBG(printf("# CUDA PairLJCutCuda::init_style start\n"); ) + // request regular or rRESPA neighbor lists + + int irequest; + + if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) { + + } + else + { + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + //neighbor->style=0; //0=NSQ neighboring + } + + + cut_respa = NULL; + MYDBG(printf("# CUDA PairLJCutCuda::init_style end\n"); ) +} + +void PairLJCutCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJCutCuda::init_list\n");) + PairLJCut::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairLJCutCuda::init_list end\n");) +} + +void PairLJCutCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairLJCut::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_lj_cut_cuda.h b/src/USER-CUDA/pair_lj_cut_cuda.h new file mode 100644 index 0000000000..f81d47952d --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the 
README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/cut/cuda,PairLJCutCuda) + +#else + +#ifndef PAIR_LJ_CUT_CUDA_H +#define PAIR_LJ_CUT_CUDA_H + +#include "pair_lj_cut.h" + +namespace LAMMPS_NS { + +class PairLJCutCuda : public PairLJCut +{ + public: + PairLJCutCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp b/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp new file mode 100644 index 0000000000..029ce05151 --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp @@ -0,0 +1,183 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_cut_experimental_cuda.h" +#include "pair_lj_cut_experimental_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJCutExperimentalCuda::PairLJCutExperimentalCuda(LAMMPS *lmp) : PairLJCut(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJCutExperimentalCuda::allocate() +{ + if(! allocated) PairLJCut::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutExperimentalCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + Cuda_PairLJCutExperimentalCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + CudaWrapper_Sync(); + if(eflag) cuda->cu_eng_vdwl->download(); + if(vflag) cuda->cu_virial->download(); + } + } + +/* ---------------------------------------------------------------------- */ + +void PairLJCutExperimentalCuda::settings(int narg, char **arg) +{ + PairLJCut::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global; +} + +/* 
---------------------------------------------------------------------- */ + +void PairLJCutExperimentalCuda::coeff(int narg, char **arg) +{ + PairLJCut::coeff(narg, arg); + allocate(); +} + +void PairLJCutExperimentalCuda::init_style() +{ + MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_style start\n"); ) + // request regular or rRESPA neighbor lists + + int irequest; + + if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) { + + } + else + { + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + //neighbor->style=0; //0=NSQ neighboring + } + + + cut_respa = NULL; + MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_style end\n"); ) +} + +void PairLJCutExperimentalCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_list\n");) + PairLJCut::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_list end\n");) +} + +void PairLJCutExperimentalCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairLJCut::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_lj_cut_experimental_cuda.h b/src/USER-CUDA/pair_lj_cut_experimental_cuda.h new file mode 100644 index 0000000000..9deb686524 --- /dev/null +++ b/src/USER-CUDA/pair_lj_cut_experimental_cuda.h @@ -0,0 +1,57 @@ +/* 
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/cut/experimental/cuda,PairLJCutExperimentalCuda) + +#else + +#ifndef PAIR_LJ_CUT_EXPERIMENTAL_CUDA_H +#define PAIR_LJ_CUT_EXPERIMENTAL_CUDA_H + +#include "pair_lj_cut.h" + +namespace LAMMPS_NS { + +class PairLJCutExperimentalCuda : public PairLJCut +{ + public: + PairLJCutExperimentalCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_expand_cuda.cpp b/src/USER-CUDA/pair_lj_expand_cuda.cpp new file mode 100644 index 0000000000..a9f2a6561f --- /dev/null +++ b/src/USER-CUDA/pair_lj_expand_cuda.cpp @@ -0,0 +1,185 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, 
sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_expand_cuda.h" +#include "pair_lj_expand_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJExpandCuda::PairLJExpandCuda(LAMMPS *lmp) : PairLJExpand(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJExpandCuda::allocate() +{ + if(! allocated) PairLJExpand::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut; + cuda->shared_data.pair.cutsq = cutsq; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.coeff5 = shift; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJExpandCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairLJExpandCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(vflag) cuda->cu_virial->download(); + } + +} + +/* ---------------------------------------------------------------------- */ + +void PairLJExpandCuda::settings(int narg, char **arg) +{ + PairLJExpand::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global; +} + +/* 
---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   let the base class parse the pair_coeff arguments, then call allocate()
+   so the coefficient arrays are published to the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJExpandCuda::coeff(int narg, char **arg)
+{
+  PairLJExpand::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   request a full, cudable neighbor list
+   nothing is requested when update->whichflag == 0 and the integrate
+   style is "respa"
+------------------------------------------------------------------------- */
+
+void PairLJExpandCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJExpandCuda::init_style start\n"); )
+  // request regular or rRESPA neighbor lists
+
+  int irequest;
+
+  if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) {
+
+  }
+  else
+  {
+    irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+
+  MYDBG(printf("# CUDA PairLJExpandCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   forward the list to the base class; unless CUDA_USE_BINNING is defined,
+   also register list id 0 with the Cuda object and keep the returned handle
+------------------------------------------------------------------------- */
+
+void PairLJExpandCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJExpandCuda::init_list\n");)
+  PairLJExpand::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairLJExpandCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup, then re-wrap the host per-atom energy and
+   virial arrays for the device whenever atom->nmax exceeds the capacity
+   recorded before the base call
+   (presumably the base class reallocates eatom/vatom in that case --
+   confirm against Pair::ev_setup)
+------------------------------------------------------------------------- */
+
+void PairLJExpandCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class could grow them
+  // NOTE(review): maxvatom is assumed to be the Pair member that pairs with
+  // maxeatom -- confirm it is accessible from this class
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairLJExpand::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+  {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+
+  // the virial wrapper is keyed on vflag_atom / maxvatom and bound to the
+  // vatom descriptor; it used to repeat the eatom condition and descriptor,
+  // so virial-only runs never rebuilt it and it was tied to the eatom entry
+  // NOTE(review): shared_data.atom.vatom is assumed to exist next to eatom
+  if (vflag_atom && atom->nmax > maxvatomold)
+  {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_expand_cuda.h b/src/USER-CUDA/pair_lj_expand_cuda.h
new file mode 100644
index 0000000000..67d1030edb
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_expand_cuda.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel
Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/expand/cuda,PairLJExpandCuda) + +#else + +#ifndef PAIR_LJ_EXPAND_CUDA_H +#define PAIR_LJ_EXPAND_CUDA_H + +#include "pair_lj_expand.h" + +namespace LAMMPS_NS { + +class PairLJExpandCuda : public PairLJExpand +{ + public: + PairLJExpandCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp new file mode 100644 index 0000000000..e3ead377ca --- /dev/null +++ b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp @@ -0,0 +1,199 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_gromacs_coul_gromacs_cuda.h" +#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJGromacsCoulGromacsCuda::PairLJGromacsCoulGromacsCuda(LAMMPS *lmp) : PairLJGromacsCoulGromacs(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->shared_data.pair.use_block_per_atom = 0; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJGromacsCoulGromacsCuda::allocate() +{ + if(! 
allocated) PairLJGromacsCoulGromacs::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.coeff5 = ljsw1; + cuda->shared_data.pair.coeff6 = ljsw2; + cuda->shared_data.pair.coeff7 = ljsw3; + cuda->shared_data.pair.coeff8 = ljsw4; + cuda->shared_data.pair.coeff9 = ljsw5; + cuda->shared_data.pair.special_lj = force->special_lj; + cuda->shared_data.pair.special_coul = force->special_coul; + cu_lj1_gm = new cCudaData ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj2_gm = new cCudaData ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj3_gm = new cCudaData ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj4_gm = new cCudaData ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw1_gm = new cCudaData ((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw2_gm = new cCudaData ((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw3_gm = new cCudaData ((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw4_gm = new cCudaData ((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw5_gm = new cCudaData ((double*)ljsw5, &cuda->shared_data.pair.coeff9_gm, (atom->ntypes+1)*(atom->ntypes+1)); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJGromacsCoulGromacsCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->upload(); + if(eflag) cuda->cu_eng_coul->upload(); + if(vflag) cuda->cu_virial->upload(); 
+ } + + Cuda_PairLJGromacsCoulGromacsCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom,cut_coul_inner,coulsw1,coulsw2,coulsw5); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(eflag) cuda->cu_eng_coul->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJGromacsCoulGromacsCuda::settings(int narg, char **arg) +{ + PairLJGromacsCoulGromacs::settings(narg, arg); + cuda->shared_data.pair.cut_global = (X_FLOAT) cut_lj; + cuda->shared_data.pair.cut_coulsq_global = (X_FLOAT) cut_coulsq; + cuda->shared_data.pair.cut_inner_global = (F_FLOAT) cut_lj_inner; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJGromacsCoulGromacsCuda::coeff(int narg, char **arg) +{ + PairLJGromacsCoulGromacs::coeff(narg, arg); + allocate(); +} + +void PairLJGromacsCoulGromacsCuda::init_style() +{ + if (!atom->q_flag) + error->all("Pair style lj/gromacs/coul/gromacs requires atom attribute q"); + // request regular or rRESPA neighbor lists + + if(atom->molecular) + { + cuda->shared_data.pair.collect_forces_later = 1; + } + + int irequest; + + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + + if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul) + error->all("Pair inner cutoff >= Pair outer cutoff"); + + cut_lj_innersq = cut_lj_inner * cut_lj_inner; + cut_ljsq = cut_lj * cut_lj; + cut_coul_innersq = cut_coul_inner * cut_coul_inner; + cut_coulsq = cut_coul * cut_coul; + cut_bothsq = MAX(cut_ljsq,cut_coulsq); + + + cut_coulsq = cut_coul * cut_coul; + + cuda->shared_data.pair.cut_coulsq_global=cut_coulsq; + + cuda->shared_data.pppm.qqrd2e=force->qqrd2e; +} + +void PairLJGromacsCoulGromacsCuda::init_list(int id, NeighList *ptr) +{ + 
MYDBG(printf("# CUDA PairLJGromacsCoulGromacsCuda::init_list\n");)
+  PairLJGromacsCoulGromacs::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairLJGromacsCoulGromacsCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup, then re-wrap the host per-atom energy and
+   virial arrays for the device whenever atom->nmax exceeds the capacity
+   recorded before the base call
+   (presumably the base class reallocates eatom/vatom in that case --
+   confirm against Pair::ev_setup)
+------------------------------------------------------------------------- */
+
+void PairLJGromacsCoulGromacsCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class could grow them
+  // NOTE(review): maxvatom is assumed to be the Pair member that pairs with
+  // maxeatom -- confirm it is accessible from this class
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairLJGromacsCoulGromacs::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+  {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+
+  // the virial wrapper is keyed on vflag_atom / maxvatom and bound to the
+  // vatom descriptor; it used to repeat the eatom condition and descriptor,
+  // so virial-only runs never rebuilt it and it was tied to the eatom entry
+  // NOTE(review): shared_data.atom.vatom is assumed to exist next to eatom
+  if (vflag_atom && atom->nmax > maxvatomold)
+  {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h
new file mode 100644
index 0000000000..333bbc0088
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/gromacs/coul/gromacs/cuda,PairLJGromacsCoulGromacsCuda) + +#else + +#ifndef LMP_PAIR_LJ_GROMACS_COUL_GROMACS_CUDA_H +#define LMP_PAIR_LJ_GROMACS_COUL_GROMACS_CUDA_H + +#include "pair_lj_gromacs_coul_gromacs.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class PairLJGromacsCoulGromacsCuda : public PairLJGromacsCoulGromacs +{ + public: + PairLJGromacsCoulGromacsCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; + cCudaData* cu_lj1_gm; + cCudaData* cu_lj2_gm; + cCudaData* cu_lj3_gm; + cCudaData* cu_lj4_gm; + cCudaData* cu_ljsw1_gm; + cCudaData* cu_ljsw2_gm; + cCudaData* cu_ljsw3_gm; + cCudaData* cu_ljsw4_gm; + cCudaData* cu_ljsw5_gm; + +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_lj_gromacs_cuda.cpp b/src/USER-CUDA/pair_lj_gromacs_cuda.cpp new file mode 100644 index 0000000000..97bbbe16f4 --- /dev/null +++ b/src/USER-CUDA/pair_lj_gromacs_cuda.cpp @@ -0,0 +1,182 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_gromacs_cuda.h" +#include "pair_lj_gromacs_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJGromacsCuda::PairLJGromacsCuda(LAMMPS *lmp) : PairLJGromacs(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->shared_data.pair.use_block_per_atom = 0; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJGromacsCuda::allocate() +{ + if(! allocated) PairLJGromacs::allocate(); + if(! 
allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut; + cuda->shared_data.pair.cut_inner = cut_inner; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.coeff5 = ljsw1; + cuda->shared_data.pair.coeff6 = ljsw2; + cuda->shared_data.pair.coeff7 = ljsw3; + cuda->shared_data.pair.coeff8 = ljsw4; + cuda->shared_data.pair.coeff9 = ljsw5; + cuda->shared_data.pair.special_lj = force->special_lj; + cu_lj1_gm = new cCudaData ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj2_gm = new cCudaData ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj3_gm = new cCudaData ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj4_gm = new cCudaData ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw1_gm = new cCudaData ((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw2_gm = new cCudaData ((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw3_gm = new cCudaData ((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw4_gm = new cCudaData ((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw5_gm = new cCudaData ((double*)ljsw5, &cuda->shared_data.pair.coeff9_gm, (atom->ntypes+1)*(atom->ntypes+1)); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJGromacsCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + } + + Cuda_PairLJGromacsCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, 
eflag, vflag, eflag_atom, vflag_atom);
+
+  if(not cuda->shared_data.pair.collect_forces_later)
+  {
+    if(eflag) cuda->cu_eng_vdwl->download();
+    if(vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   let the base class parse pair_style arguments, then mirror the global
+   outer and inner cutoffs into the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::settings(int narg, char **arg)
+{
+  PairLJGromacs::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global;
+  cuda->shared_data.pair.cut_inner_global = (F_FLOAT) cut_inner_global;
+}
+
+/* ----------------------------------------------------------------------
+   let the base class parse the pair_coeff arguments, then call allocate()
+   so the coefficient arrays are published to the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::coeff(int narg, char **arg)
+{
+  PairLJGromacs::coeff(narg, arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   set collect_forces_later when the system is molecular, then request a
+   full, cudable neighbor list
+------------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::init_style()
+{
+  // request regular or rRESPA neighbor lists
+
+  if(atom->molecular)
+  {
+    cuda->shared_data.pair.collect_forces_later = 1;
+  }
+
+  int irequest;
+
+  irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+
+
+}
+
+/* ----------------------------------------------------------------------
+   forward the list to the base class; unless CUDA_USE_BINNING is defined,
+   also register list id 0 with the Cuda object and keep the returned handle
+------------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJGromacsCuda::init_list\n");)
+  PairLJGromacs::init_list(id, ptr);
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
+  // see Neighbor::init() for details on lammps lists' logic
+  #endif
+  MYDBG(printf("# CUDA PairLJGromacsCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup, then re-wrap the host per-atom energy and
+   virial arrays for the device whenever atom->nmax exceeds the capacity
+   recorded before the base call
+   (presumably the base class reallocates eatom/vatom in that case --
+   confirm against Pair::ev_setup)
+------------------------------------------------------------------------- */
+
+void PairLJGromacsCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class could grow them
+  // NOTE(review): maxvatom is assumed to be the Pair member that pairs with
+  // maxeatom -- confirm it is accessible from this class
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairLJGromacs::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+  {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+
+  // the virial wrapper is keyed on vflag_atom / maxvatom and bound to the
+  // vatom descriptor; it used to repeat the eatom condition and descriptor,
+  // so virial-only runs never rebuilt it and it was tied to the eatom entry
+  // NOTE(review): shared_data.atom.vatom is assumed to exist next to eatom
+  if (vflag_atom && atom->nmax > maxvatomold)
+  {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git
a/src/USER-CUDA/pair_lj_gromacs_cuda.h b/src/USER-CUDA/pair_lj_gromacs_cuda.h new file mode 100644 index 0000000000..64e38aa763 --- /dev/null +++ b/src/USER-CUDA/pair_lj_gromacs_cuda.h @@ -0,0 +1,68 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/gromacs/cuda,PairLJGromacsCuda) + +#else + +#ifndef LMP_PAIR_LJ_GROMACS_CUDA_H +#define LMP_PAIR_LJ_GROMACS_CUDA_H + +#include "pair_lj_gromacs.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class PairLJGromacsCuda : public PairLJGromacs +{ + public: + PairLJGromacsCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; + cCudaData* cu_lj1_gm; + cCudaData* cu_lj2_gm; + cCudaData* cu_lj3_gm; + cCudaData* cu_lj4_gm; + cCudaData* cu_ljsw1_gm; + cCudaData* cu_ljsw2_gm; + cCudaData* cu_ljsw3_gm; + cCudaData* cu_ljsw4_gm; + cCudaData* cu_ljsw5_gm; + +}; + +} + +#endif +#endif diff --git 
a/src/USER-CUDA/pair_lj_smooth_cuda.cpp b/src/USER-CUDA/pair_lj_smooth_cuda.cpp new file mode 100644 index 0000000000..c8aef2ec00 --- /dev/null +++ b/src/USER-CUDA/pair_lj_smooth_cuda.cpp @@ -0,0 +1,182 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + Contributing author: Paul Crozier (SNL) + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_lj_smooth_cuda.h" +#include "pair_lj_smooth_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "kspace.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairLJSmoothCuda::PairLJSmoothCuda(LAMMPS *lmp) : PairLJSmooth(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->shared_data.pair.use_block_per_atom = 0; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairLJSmoothCuda::allocate() +{ + if(! allocated) PairLJSmooth::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut; + cuda->shared_data.pair.cut_inner = cut_inner; + cuda->shared_data.pair.coeff1 = lj1; + cuda->shared_data.pair.coeff2 = lj2; + cuda->shared_data.pair.coeff3 = lj3; + cuda->shared_data.pair.coeff4 = lj4; + cuda->shared_data.pair.coeff5 = ljsw1; + cuda->shared_data.pair.coeff6 = ljsw2; + cuda->shared_data.pair.coeff7 = ljsw3; + cuda->shared_data.pair.coeff8 = ljsw4; + cuda->shared_data.pair.coeff9 = ljsw0; + cuda->shared_data.pair.special_lj = force->special_lj; + cu_lj1_gm = new cCudaData ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj2_gm = new cCudaData ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj3_gm = new cCudaData ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_lj4_gm = new cCudaData ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw0_gm = new cCudaData ((double*)ljsw0, &cuda->shared_data.pair.coeff9_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw1_gm = new cCudaData ((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw2_gm = new cCudaData ((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw3_gm = new cCudaData ((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, (atom->ntypes+1)*(atom->ntypes+1)); + cu_ljsw4_gm = new cCudaData 
((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, (atom->ntypes+1)*(atom->ntypes+1)); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJSmoothCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + } + + Cuda_PairLJSmoothCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(vflag) cuda->cu_virial->download(); + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJSmoothCuda::settings(int narg, char **arg) +{ + PairLJSmooth::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global; + cuda->shared_data.pair.cut_inner_global = (F_FLOAT) cut_inner_global; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJSmoothCuda::coeff(int narg, char **arg) +{ + PairLJSmooth::coeff(narg, arg); + allocate(); +} + +void PairLJSmoothCuda::init_style() +{ + // request regular or rRESPA neighbor lists + + if(atom->molecular) + { + cuda->shared_data.pair.collect_forces_later = 1; + } + + int irequest; + + irequest = neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + + + +} + +void PairLJSmoothCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairLJSmoothCuda::init_list\n");) + PairLJSmooth::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairLJSmoothCuda::init_list end\n");) +} + 
+/* ----------------------------------------------------------------------
+   run the base-class ev_setup, then re-wrap the host per-atom energy and
+   virial arrays for the device whenever atom->nmax exceeds the capacity
+   recorded before the base call
+   (presumably the base class reallocates eatom/vatom in that case --
+   confirm against Pair::ev_setup)
+------------------------------------------------------------------------- */
+
+void PairLJSmoothCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class could grow them
+  // NOTE(review): maxvatom is assumed to be the Pair member that pairs with
+  // maxeatom -- confirm it is accessible from this class
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairLJSmooth::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold)
+  {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );}
+
+  // the virial wrapper is keyed on vflag_atom / maxvatom and bound to the
+  // vatom descriptor; it used to repeat the eatom condition and descriptor,
+  // so virial-only runs never rebuilt it and it was tied to the eatom entry
+  // NOTE(review): shared_data.atom.vatom is assumed to exist next to eatom
+  if (vflag_atom && atom->nmax > maxvatomold)
+  {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6 );}
+
+}
+
+
diff --git a/src/USER-CUDA/pair_lj_smooth_cuda.h b/src/USER-CUDA/pair_lj_smooth_cuda.h
new file mode 100644
index 0000000000..32f6b4fabc
--- /dev/null
+++ b/src/USER-CUDA/pair_lj_smooth_cuda.h
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/smooth/cuda,PairLJSmoothCuda) + +#else + +#ifndef LMP_PAIR_LJ_SMOOTH_CUDA_H +#define LMP_PAIR_LJ_SMOOTH_CUDA_H + +#include "pair_lj_smooth.h" +#include "cuda_data.h" + +namespace LAMMPS_NS { + +class PairLJSmoothCuda : public PairLJSmooth +{ + public: + PairLJSmoothCuda(class LAMMPS *); + void compute(int, int); + void settings(int, char **); + void coeff(int, char **); + void init_list(int, class NeighList *); + void init_style(); + void ev_setup(int eflag, int vflag); + protected: + class Cuda *cuda; + void allocate(); + bool allocated2; + class CudaNeighList* cuda_neigh_list; + cCudaData* cu_lj1_gm; + cCudaData* cu_lj2_gm; + cCudaData* cu_lj3_gm; + cCudaData* cu_lj4_gm; + cCudaData* cu_ljsw0_gm; + cCudaData* cu_ljsw1_gm; + cCudaData* cu_ljsw2_gm; + cCudaData* cu_ljsw3_gm; + cCudaData* cu_ljsw4_gm; + +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/pair_morse_cuda.cpp b/src/USER-CUDA/pair_morse_cuda.cpp new file mode 100644 index 0000000000..b556c158d6 --- /dev/null +++ b/src/USER-CUDA/pair_morse_cuda.cpp @@ -0,0 +1,182 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "pair_morse_cuda.h" +#include "pair_morse_cuda_cu.h" +#include "cuda_data.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "cuda_neigh_list.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "memory.h" +#include "error.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PairMorseCuda::PairMorseCuda(LAMMPS *lmp) : PairMorse(lmp) +{ + cuda = lmp->cuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. 
Use no '-a' command line argument, or '-a cuda'."); + + allocated2 = false; + cuda->shared_data.pair.cudable_force = 1; + cuda->setSystemParams(); +} + +/* ---------------------------------------------------------------------- + remember pointer to arrays in cuda shared data +------------------------------------------------------------------------- */ + +void PairMorseCuda::allocate() +{ + if(! allocated) PairMorse::allocate(); + if(! allocated2) + { + allocated2 = true; + cuda->shared_data.pair.cut = cut; + cuda->shared_data.pair.coeff1 = r0; + cuda->shared_data.pair.coeff2 = alpha; + cuda->shared_data.pair.coeff3 = morse1; + cuda->shared_data.pair.coeff4 = d0; + cuda->shared_data.pair.offset = offset; + cuda->shared_data.pair.special_lj = force->special_lj; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairMorseCuda::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + if(eflag) cuda->cu_eng_vdwl->upload(); + if(vflag) cuda->cu_virial->upload(); + + Cuda_PairMorseCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom); + + if(not cuda->shared_data.pair.collect_forces_later) + { + if(eflag) cuda->cu_eng_vdwl->download(); + if(vflag) cuda->cu_virial->download(); + } + +} + +/* ---------------------------------------------------------------------- */ + +void PairMorseCuda::settings(int narg, char **arg) +{ + PairMorse::settings(narg, arg); + cuda->shared_data.pair.cut_global = (F_FLOAT) cut_global; +} + +/* ---------------------------------------------------------------------- */ + +void PairMorseCuda::coeff(int narg, char **arg) +{ + PairMorse::coeff(narg, arg); + allocate(); +} + +void PairMorseCuda::init_style() +{ + MYDBG(printf("# CUDA PairMorseCuda::init_style start\n"); ) + // request regular or rRESPA neighbor lists + + int irequest; + + if (update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0) { + + } + else + { + irequest = 
neighbor->request(this); + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->cudable = 1; + //neighbor->style=0; //0=NSQ neighboring + } + + + MYDBG(printf("# CUDA PairMorseCuda::init_style end\n"); ) +} + +void PairMorseCuda::init_list(int id, NeighList *ptr) +{ + MYDBG(printf("# CUDA PairMorseCuda::init_list\n");) + PairMorse::init_list(id, ptr); + #ifndef CUDA_USE_BINNING + // right now we can only handle verlet (id 0), not respa + if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr); + // see Neighbor::init() for details on lammps lists' logic + #endif + MYDBG(printf("# CUDA PairMorseCuda::init_list end\n");) +} + +void PairMorseCuda::ev_setup(int eflag, int vflag) +{ + int maxeatomold=maxeatom; + PairMorse::ev_setup(eflag,vflag); + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax );} + + if (eflag_atom && atom->nmax > maxeatomold) + {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6 );} + +} + + diff --git a/src/USER-CUDA/pair_morse_cuda.h b/src/USER-CUDA/pair_morse_cuda.h new file mode 100644 index 0000000000..aae40294ba --- /dev/null +++ b/src/USER-CUDA/pair_morse_cuda.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+// with PAIR_CLASS defined this header contributes only the style entry that
+// ties the name morse/cuda to class PairMorseCuda
+// (presumably consumed by the pair-style factory - confirm at the include site)
+PairStyle(morse/cuda,PairMorseCuda)
+
+#else
+
+#ifndef PAIR_MORSE_CUDA_H
+#define PAIR_MORSE_CUDA_H
+
+#include "pair_morse.h"
+
+namespace LAMMPS_NS {
+
+// CUDA flavour of PairMorse; the method bodies live in pair_morse_cuda.cpp
+class PairMorseCuda : public PairMorse
+{
+ public:
+  PairMorseCuda(class LAMMPS *);           // errors out when lmp->cuda is NULL
+  void compute(int, int);                  // calls Cuda_PairMorseCuda()
+  void settings(int, char **);             // also stores cut_global in the cuda shared data
+  void coeff(int, char **);                // PairMorse::coeff() followed by allocate()
+  void init_list(int, class NeighList *);  // registers list id 0 with the Cuda object
+  void init_style();                       // requests a full, cudable neighbor list
+  void ev_setup(int eflag, int vflag);     // re-creates cuda->cu_eatom / cu_vatom
+ protected:
+  class Cuda *cuda;                        // taken from lmp->cuda in the constructor
+  void allocate();                         // publishes coefficient array pointers to shared_data.pair
+  bool allocated2;                         // true once allocate() has published the pointers
+  class CudaNeighList* cuda_neigh_list;    // set by init_list(), read by compute()
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-CUDA/pppm_cuda.cpp b/src/USER-CUDA/pppm_cuda.cpp
new file mode 100644
index 0000000000..16ef9ae49f
--- /dev/null
+++ b/src/USER-CUDA/pppm_cuda.cpp
@@ -0,0 +1,1741 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL) +------------------------------------------------------------------------- */ + + +#include "mpi.h" +#include +#include +#include +#include +#include "pppm_cuda.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap_cuda.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include //crmadd +#include "cuda_wrapper_cu.h" +#include "pppm_cuda_cu.h" +#include "cuda.h" + +using namespace LAMMPS_NS; + +#define MAXORDER 7 +#define OFFSET 4096 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + + +void printArray(double* data,int nx, int ny, int nz) +{ + for(int i=0;icuda; + if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); + + if ((narg > 3)||(narg<1)) error->all("Illegal kspace_style pppm/cuda command"); + #ifndef FFT_CUFFT + error->all("Using kspace_style pppm/cuda without cufft is not possible. Compile with cufft=1 to include cufft. Aborting."); + #endif + precision = atof(arg[0]); + if(narg>1) + precisionmodify=arg[1][0]; + else precisionmodify='='; + PI = 4.0*atan(1.0); + + nfactors = 3; + factors = new int[nfactors]; + factors[0] = 2; + factors[1] = 3; + factors[2] = 5; + + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + + density_brick = vdx_brick = vdy_brick = vdz_brick = vdx_brick_tmp = NULL; + density_fft = NULL; + greensfn = NULL; + work1 = work2 = NULL; + vg = NULL; + fkx = fky = fkz = NULL; + buf1 = buf2 = NULL; + + gf_b = NULL; + rho1d = rho_coeff = NULL; + + fft1c = fft2c = NULL; + remap = NULL; + + density_brick_int=NULL; + density_intScale=1000000; + cu_vdx_brick = cu_vdy_brick = cu_vdz_brick = NULL; + cu_density_brick = NULL; + cu_density_brick_int = NULL; + cu_density_fft = NULL; + cu_energy=NULL; + cu_greensfn = NULL; + cu_work1 = cu_work2 = cu_work3 = NULL; + cu_vg = NULL; + cu_fkx = cu_fky = cu_fkz = NULL; + + cu_flag = NULL; + cu_debugdata = NULL; + cu_rho_coeff = NULL; + cu_virial = NULL; + + cu_gf_b = NULL; + + cu_slabbuf = NULL; + slabbuf = NULL; + + nmax = 0; + part2grid = NULL; + cu_part2grid = NULL; + adev_data_array=NULL; + poissontime=0; + old_nmax=0; + cu_pppm_grid_n=NULL; + cu_pppm_grid_ids=NULL; + + pppm_grid_nmax=0; + pppm2partgrid=new int[3]; + pppm_grid=new int[3]; + firstpass=true; + scale = 1.0; +} + + +/* ---------------------------------------------------------------------- + free all memory 
+------------------------------------------------------------------------- */
+
+// NOTE(review): the constructor allocates pppm2partgrid and pppm_grid with
+// new int[3]; neither is released here - confirm whether deallocate() does it
+PPPMCuda::~PPPMCuda()
+{
+  delete [] slabbuf;
+  delete cu_slabbuf;
+
+  delete [] factors;
+  factors=NULL;
+  deallocate();
+  delete cu_part2grid;
+  cu_part2grid=NULL;
+  memory->destroy(part2grid);
+  part2grid = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   called once before run
+   checks the setup, reads the Coulomb cutoff (and TIP4P data) from the
+   pair style, sums the charges, chooses grid/order, derives the brick,
+   ghost and FFT index ranges, copies them into cuda->shared_data.pppm
+   and finally allocates and pre-computes the K-space tables
+------------------------------------------------------------------------- */
+
+void PPPMCuda::init()
+{
+
+  cuda->shared_data.pppm.cudable_force=1;
+
+  //if(cuda->finished_run) {PPPM::init(); return;}
+
+  if (me == 0) {
+    if (screen) fprintf(screen,"PPPMCuda initialization ...\n");
+    if (logfile) fprintf(logfile,"PPPMCuda initialization ...\n");
+  }
+
+  // error check
+
+  if (domain->triclinic)
+    error->all("Cannot (yet) use PPPMCuda with triclinic box");
+  if (domain->dimension == 2) error->all("Cannot use PPPMCuda with 2d simulation");
+
+  if (!atom->q_flag) error->all("Kspace style requires atom attribute q");
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all("Cannot use nonperiodic boundaries with PPPMCuda");
+  if (slabflag == 1) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all("Incorrect boundaries with slab PPPMCuda");
+  }
+
+  if (order > MAXORDER) {
+    char str[128];
+    sprintf(str,"PPPMCuda order cannot be greater than %d",MAXORDER);
+    error->all(str);
+  }
+  // free all arrays previously allocated
+
+  deallocate();
+
+  // extract short-range Coulombic cutoff from pair style
+
+  qqrd2e = force->qqrd2e;
+
+  if (force->pair == NULL)
+    error->all("KSpace style is incompatible with Pair style");
+  int itmp=0;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all("KSpace style is incompatible with Pair style");
+  cutoff = *p_cutoff;
+
+  // if kspace is TIP4P, extract TIP4P params from pair style
+
+  qdist = 0.0;
+
+  if (strcmp(force->kspace_style,"pppm/tip4p") == 0) {
+    if (force->pair == NULL)
+      error->all("KSpace style is incompatible with Pair style");
+    double *p_qdist = (double *) force->pair->extract("qdist",itmp);
+    int *p_typeO = (int *) force->pair->extract("typeO",itmp);
+    int *p_typeH = (int *) force->pair->extract("typeH",itmp);
+    int *p_typeA = (int *) force->pair->extract("typeA",itmp);
+    int *p_typeB = (int *) force->pair->extract("typeB",itmp);
+    if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB)
+      error->all("KSpace style is incompatible with Pair style");
+    qdist = *p_qdist;
+    typeO = *p_typeO;
+    typeH = *p_typeH;
+    int typeA = *p_typeA;
+    int typeB = *p_typeB;
+
+    if (force->angle == NULL || force->bond == NULL)
+      error->all("Bond and angle potentials must be defined for TIP4P");
+    double theta = force->angle->equilibrium_angle(typeA);
+    double blen = force->bond->equilibrium_distance(typeB);
+    alpha = qdist / (2.0 * cos(0.5*theta) * blen);
+  }
+
+  // compute qsum & qsqsum and warn if not charge-neutral
+
+  qsum = qsqsum = 0.0;
+  for (int i = 0; i < atom->nlocal; i++) {
+    qsum += atom->q[i];
+    qsqsum += atom->q[i]*atom->q[i];
+  }
+
+  double tmp;
+  MPI_Allreduce(&qsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
+  qsum = tmp;
+  MPI_Allreduce(&qsqsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
+  qsqsum = tmp;
+
+  if (qsqsum == 0.0)
+    error->all("Cannot use kspace solver on system with no charge");
+  if (fabs(qsum) > SMALL && me == 0) {
+    char str[128];
+    sprintf(str,"System is not charge neutral, net charge = %g",qsum);
+    error->warning(str);
+  }
+
+  // setup FFT grid resolution and g_ewald
+  // normally one iteration thru while loop is all that is required
+  // if grid stencil extends beyond neighbor proc, reduce order and try again
+
+  int iteration = 0;
+
+  while (order > 0) {
+
+    if (iteration && me == 0)
+      error->warning("Reducing PPPMCuda order b/c stencil extends "
+                     "beyond neighbor processor");
+    iteration++;
+
+    set_grid();
+
+    if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET)
+      error->all("PPPMCuda grid is too large");
+
+    // global indices of PPPMCuda grid range from 0 to N-1
+    // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
+    //   global PPPMCuda grid that I own without ghost cells
+    // for slab PPPMCuda, assign z grid as if it were not extended
+
+    nxlo_in = comm->myloc[0]*nx_pppm / comm->procgrid[0];
+    nxhi_in = (comm->myloc[0]+1)*nx_pppm / comm->procgrid[0] - 1;
+    nylo_in = comm->myloc[1]*ny_pppm / comm->procgrid[1];
+    nyhi_in = (comm->myloc[1]+1)*ny_pppm / comm->procgrid[1] - 1;
+    nzlo_in = comm->myloc[2] *
+      (static_cast (nz_pppm/slab_volfactor)) / comm->procgrid[2];
+    nzhi_in = (comm->myloc[2]+1) *
+      (static_cast (nz_pppm/slab_volfactor)) / comm->procgrid[2] - 1;
+
+    // nlower,nupper = stencil size for mapping particles to PPPMCuda grid
+
+    nlower = -(order-1)/2;
+    nupper = order/2;
+
+    // shift values for particle <-> grid mapping
+    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
+
+    if (order % 2) shift = OFFSET + 0.5;
+    else shift = OFFSET;
+    if (order % 2) shiftone = 0.0;
+    else shiftone = 0.5;
+
+    // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of
+    //   global PPPMCuda grid that my particles can contribute charge to
+    // effectively nlo_in,nhi_in + ghost cells
+    // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest
+    //           position a particle in my box can be at
+    // dist[3] = particle position bound = subbox + skin/2.0 + qdist
+    //   qdist = offset due to TIP4P fictitious charge
+    //   convert to triclinic if necessary
+    // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping
+    // for slab PPPMCuda, assign z grid as if it were not extended
+
+
+    triclinic = domain->triclinic;
+    double *prd,*sublo,*subhi;
+
+    if (triclinic == 0) {
+      prd = domain->prd;
+      boxlo = domain->boxlo;
+      sublo = domain->sublo;
+      subhi = domain->subhi;
+    } else {
+      prd = domain->prd_lamda;
+      boxlo = domain->boxlo_lamda;
+      sublo = domain->sublo_lamda;
+      subhi = domain->subhi_lamda;
+    }
+
+    double xprd = prd[0];
+    double yprd = prd[1];
+    double zprd = prd[2];
+    double zprd_slab = zprd*slab_volfactor;
+
+    double dist[3];
+    double cuthalf = 0.5*neighbor->skin + qdist;
+    if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
+    else {
+      dist[0] = cuthalf/domain->prd[0];
+      dist[1] = cuthalf/domain->prd[1];
+      dist[2] = cuthalf/domain->prd[2];
+    }
+
+    int nlo,nhi;
+
+    nlo = static_cast ((sublo[0]-dist[0]-boxlo[0]) *
+                            nx_pppm/xprd + shift) - OFFSET;
+    nhi = static_cast ((subhi[0]+dist[0]-boxlo[0]) *
+                            nx_pppm/xprd + shift) - OFFSET;
+    nxlo_out = nlo + nlower;
+    nxhi_out = nhi + nupper;
+
+    nlo = static_cast ((sublo[1]-dist[1]-boxlo[1]) *
+                            ny_pppm/yprd + shift) - OFFSET;
+    nhi = static_cast ((subhi[1]+dist[1]-boxlo[1]) *
+                            ny_pppm/yprd + shift) - OFFSET;
+    nylo_out = nlo + nlower;
+    nyhi_out = nhi + nupper;
+
+    nlo = static_cast ((sublo[2]-dist[2]-boxlo[2]) *
+                            nz_pppm/zprd_slab + shift) - OFFSET;
+    nhi = static_cast ((subhi[2]+dist[2]-boxlo[2]) *
+                            nz_pppm/zprd_slab + shift) - OFFSET;
+    nzlo_out = nlo + nlower;
+    nzhi_out = nhi + nupper;
+
+    // for slab PPPMCuda, change the grid boundary for processors at +z end
+    //   to include the empty volume between periodically repeating slabs
+    // for slab PPPMCuda, want charge data communicated from -z proc to +z proc,
+    //   but not vice versa, also want field data communicated from +z proc to
+    //   -z proc, but not vice versa
+    // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
+
+    if (slabflag && ((comm->myloc[2]+1) == (comm->procgrid[2]))) {
+      nzhi_in =  nz_pppm - 1;
+      nzhi_out = nz_pppm - 1;
+    }
+
+    // nlo_ghost,nhi_ghost = # of planes I will recv from 6 directions
+    //   that overlay domain I own
+    // proc in that direction tells me via sendrecv()
+    // if no neighbor proc, value is from self since I have ghosts regardless
+
+    int nplanes;
+    MPI_Status status;
+
+    nplanes = nxlo_in - nxlo_out;
+    if (comm->procneigh[0][0] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][0],0,
+                   &nxhi_ghost,1,MPI_INT,comm->procneigh[0][1],0,
+                   world,&status);
+    else nxhi_ghost = nplanes;
+
+    nplanes = nxhi_out - nxhi_in;
+    if (comm->procneigh[0][1] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][1],0,
+                   &nxlo_ghost,1,MPI_INT,comm->procneigh[0][0],
+                   0,world,&status);
+    else nxlo_ghost = nplanes;
+
+    nplanes = nylo_in - nylo_out;
+    if (comm->procneigh[1][0] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][0],0,
+                   &nyhi_ghost,1,MPI_INT,comm->procneigh[1][1],0,
+                   world,&status);
+    else nyhi_ghost = nplanes;
+
+    nplanes = nyhi_out - nyhi_in;
+    if (comm->procneigh[1][1] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][1],0,
+                   &nylo_ghost,1,MPI_INT,comm->procneigh[1][0],0,
+                   world,&status);
+    else nylo_ghost = nplanes;
+
+    nplanes = nzlo_in - nzlo_out;
+    if (comm->procneigh[2][0] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][0],0,
+                   &nzhi_ghost,1,MPI_INT,comm->procneigh[2][1],0,
+                   world,&status);
+    else nzhi_ghost = nplanes;
+
+    nplanes = nzhi_out - nzhi_in;
+    if (comm->procneigh[2][1] != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][1],0,
+                   &nzlo_ghost,1,MPI_INT,comm->procneigh[2][0],0,
+                   world,&status);
+    else nzlo_ghost = nplanes;
+
+    // test that ghost overlap is not bigger than my sub-domain
+
+    int flag = 0;
+    if (nxlo_ghost > nxhi_in-nxlo_in+1) flag = 1;
+    if (nxhi_ghost > nxhi_in-nxlo_in+1) flag = 1;
+    if (nylo_ghost > nyhi_in-nylo_in+1) flag = 1;
+    if (nyhi_ghost > nyhi_in-nylo_in+1) flag = 1;
+    if (nzlo_ghost > nzhi_in-nzlo_in+1) flag = 1;
+    if (nzhi_ghost > nzhi_in-nzlo_in+1) flag = 1;
+
+    // the decision to retry with a lower order is taken collectively
+    int flag_all;
+    MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
+
+    if (flag_all == 0) break;
+    order--;
+  }
+
+  if (order == 0) error->all("PPPMCuda order has been reduced to 0");
+  //printf("PPPMCuda: order is %i\n");
+
+
+
+  // decomposition of FFT mesh
+  // global indices range from 0 to N-1
+  // proc owns entire x-dimension, clump of columns in y,z dimensions
+  // npey_fft,npez_fft = # of procs in y,z dims
+  // if nprocs is small enough, proc can own 1 or more entire xy planes,
+  //   else proc owns 2d sub-blocks of yz plane
+  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
+  // nlo_fft,nhi_fft = lower/upper limit of the section
+  //   of the global FFT mesh that I own
+
+  int npey_fft,npez_fft;
+  if (nz_pppm >= nprocs) {
+    npey_fft = 1;
+    npez_fft = nprocs;
+  } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
+
+  int me_y = me % npey_fft;
+  int me_z = me / npey_fft;
+
+  nxlo_fft = 0;
+  nxhi_fft = nx_pppm - 1;
+  nylo_fft = me_y*ny_pppm/npey_fft;
+  nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
+  nzlo_fft = me_z*nz_pppm/npez_fft;
+  nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
+
+  // PPPMCuda grid for this proc, including ghosts
+
+  ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
+    (nzhi_out-nzlo_out+1);
+
+  // FFT arrays on this proc, without ghosts
+  // nfft = FFT points in FFT decomposition on this proc
+  // nfft_brick = FFT points in 3d brick-decomposition on this proc
+  // nfft_both = greater of 2 values
+
+  nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
+    (nzhi_fft-nzlo_fft+1);
+  int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
+    (nzhi_in-nzlo_in+1);
+  nfft_both = MAX(nfft,nfft_brick);
+
+  // buffer space for use in brick2fft and fillbrick
+  // idel = max # of ghost planes to send or recv in +/- dir of each dim
+  // nx,ny,nz = owned planes (including ghosts) in each dim
+  // nxx,nyy,nzz = max # of grid cells to send in each dim
+  // nbuf = max in any dim, augment by 3x for components of vd_xyz in fillbrick
+
+  int idelx,idely,idelz,nx,ny,nz,nxx,nyy,nzz;
+
+  idelx = MAX(nxlo_ghost,nxhi_ghost);
+  idelx = MAX(idelx,nxhi_out-nxhi_in);
+  idelx = MAX(idelx,nxlo_in-nxlo_out);
+
+  idely = MAX(nylo_ghost,nyhi_ghost);
+  idely = MAX(idely,nyhi_out-nyhi_in);
+  idely = MAX(idely,nylo_in-nylo_out);
+
+  idelz = MAX(nzlo_ghost,nzhi_ghost);
+  idelz = MAX(idelz,nzhi_out-nzhi_in);
+  idelz = MAX(idelz,nzlo_in-nzlo_out);
+
+  nx = nxhi_out - nxlo_out + 1;
+  ny = nyhi_out - nylo_out + 1;
+  nz = nzhi_out - nzlo_out + 1;
+
+  nxx = idelx * ny * nz;
+  nyy = idely * nx * nz;
+  nzz = idelz * nx * ny;
+
+  nbuf = MAX(nxx,nyy);
+  nbuf = MAX(nbuf,nzz);
+  nbuf *= 3;
+
+  // print stats
+
+  int ngrid_max,nfft_both_max,nbuf_max;
+  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nbuf,&nbuf_max,1,MPI_INT,MPI_MAX,world);
+
+  if (me == 0) {
+    if (screen) fprintf(screen,"  brick FFT buffer size/proc = %d %d %d\n",
+                        ngrid_max,nfft_both_max,nbuf_max);
+    if (logfile) fprintf(logfile,"  brick FFT buffer size/proc = %d %d %d\n",
+                         ngrid_max,nfft_both_max,nbuf_max);
+  }
+  // mirror the grid description into the cuda shared data
+cuda_shared_pppm* ap=&(cuda->shared_data.pppm);
+
+  ap->density_intScale=density_intScale;
+  ap->nxlo_in=nxlo_in;
+  ap->nxhi_in=nxhi_in;
+  ap->nxlo_out=nxlo_out;
+  ap->nxhi_out=nxhi_out;
+  ap->nylo_in=nylo_in;
+  ap->nyhi_in=nyhi_in;
+  ap->nylo_out=nylo_out;
+  ap->nyhi_out=nyhi_out;
+  ap->nzlo_in=nzlo_in;
+  ap->nzhi_in=nzhi_in;
+  ap->nzlo_out=nzlo_out;
+  ap->nzhi_out=nzhi_out;
+  // NOTE(review): the next six lines store the FFT-decomposition bounds in the
+  // same n?lo_in/n?hi_in fields written just above, overwriting the brick
+  // bounds - confirm whether separate *_fft fields of cuda_shared_pppm
+  // were intended
+  ap->nxlo_in=nxlo_fft;
+  ap->nxhi_in=nxhi_fft;
+  ap->nylo_in=nylo_fft;
+  ap->nyhi_in=nyhi_fft;
+  ap->nzlo_in=nzlo_fft;
+  ap->nzhi_in=nzhi_fft;
+  ap->nx_pppm=nx_pppm;
+  ap->ny_pppm=ny_pppm;
+  ap->nz_pppm=nz_pppm;
+  ap->qqrd2e=qqrd2e;
+  ap->order=order;
+  ap->nmax=nmax;
+  ap->nlocal=atom->nlocal;
+  ap->delxinv=delxinv;
+  ap->delyinv=delyinv;
+  ap->delzinv=delzinv;
+  ap->nlower=nlower;
+  ap->nupper=nupper;
+  ap->shiftone=shiftone;
+
+  // allocate K-space dependent memory
+
+
+  allocate();
+
+  // pre-compute Green's function denominator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+}
+
+/* ----------------------------------------------------------------------
+   adjust PPPMCuda coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */ + +void PPPMCuda::setup() +{ + int i,j,k,l,m,n; + double *prd; + cu_gf_b->upload(); + // volume-dependent factors + // adjust z dimension for 2d slab PPPMCuda + // z dimension for 3d PPPMCuda is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + delxinv = nx_pppm/xprd; + delyinv = ny_pppm/yprd; + delzinv = nz_pppm/zprd_slab; + + delvolinv = delxinv*delyinv*delzinv; + + double unitkx = (2.0*PI/xprd); + double unitky = (2.0*PI/yprd); + double unitkz = (2.0*PI/zprd_slab); + + // fkx,fky,fkz for my FFT grid pts + Cuda_PPPM_Setup_fkxyz_vg(nx_pppm, ny_pppm,nz_pppm,unitkx,unitky,unitkz,g_ewald); + + +/* cu_vg->download(); + int offset=8100-2;//10*(nxhi_fft-nxlo_fft+1)*(nyhi_fft-nylo_fft+1)+10*(nyhi_fft-nylo_fft+1); + for (int i=nxlo_fft; i <= nxhi_fft+1;i++) printf("%e ",vg[i-nxlo_fft+offset][0]); + printf("\n\n"); + double per; + + #ifndef FFT_CUFFT + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per = i - nx_pppm*(2*i/nx_pppm); + fkx[i] = unitkx*per; + } + + for (i = nylo_fft; i <= nyhi_fft; i++) { + per = i - ny_pppm*(2*i/ny_pppm); + fky[i] = unitky*per; + } + + for (i = nzlo_fft; i <= nzhi_fft; i++) { + per = i - nz_pppm*(2*i/nz_pppm); + fkz[i] = unitkz*per; + } + #endif + #ifdef FFT_CUFFT + for (i = 0; i < nx_pppm; i++) { + per = i - nx_pppm*(2*i/nx_pppm); + fkx[i] = unitkx*per; + } + + for (i = 0; i < ny_pppm; i++) { + per = i - ny_pppm*(2*i/ny_pppm); + fky[i] = unitky*per; + } + + for (i = 0; i < nz_pppm; i++) { + per = i - nz_pppm*(2*i/nz_pppm); + fkz[i] = unitkz*per; + } + #endif + + // virial coefficients + + double sqk,vterm; +int save_n=0; +int s_i,s_j,s_k; +double max=0.0; + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + for (j = nylo_fft; j <= nyhi_fft; j++) { + for (i = 
nxlo_fft; i <= nxhi_fft; i++) { + sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; + if(n==8100) printf("%lf\n",sqk); + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + double tmp=vg[n][0]; + vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; + if(((vg[n][0]-tmp)*(vg[n][0]-tmp)>1e-6)&&(save_n==0)) {save_n=n;s_k=k;s_j=j;s_i=i;} + vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; + vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; + vg[n][3] = vterm*fkx[i]*fky[j]; + vg[n][4] = vterm*fkx[i]*fkz[k]; + vg[n][5] = vterm*fky[j]*fkz[k]; + //if(vg[n][0]>max) {max=vg[n][0]; save_n=n;} + } + n++; + } + } + } + printf("%lf %i %i %i %i\n",max,save_n,s_k,s_j,s_i); + for (int i=nxlo_fft; i <= nxhi_fft;i++) printf("%e ",vg[i-nxlo_fft+offset][0]); + printf("\n\n"); + + //cu_fkx->upload(); + //cu_fky->upload(); + // cu_fkz->upload(); + //cu_vg->upload(); */ + // modified (Hockney-Eastwood) Coulomb Green's function + +double sqk; + int nx,ny,nz,kper,lper,mper; + double snx,sny,snz,snx2,sny2,snz2; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + + int nbx = static_cast ((g_ewald*xprd/(PI*nx_pppm)) * + pow(-log(EPS_HOC),0.25)); + int nby = static_cast ((g_ewald*yprd/(PI*ny_pppm)) * + pow(-log(EPS_HOC),0.25)); + int nbz = static_cast ((g_ewald*zprd_slab/(PI*nz_pppm)) * + pow(-log(EPS_HOC),0.25)); + Cuda_PPPM_setup_greensfn(nx_pppm,ny_pppm,nz_pppm,unitkx,unitky,unitkz,g_ewald, +nbx,nby,nbz,xprd,yprd,zprd_slab); +/* + double form = 1.0; + + n = 0; +#ifndef FFT_CUFFT + for (m = nzlo_fft; m <= nzhi_fft; m++) { +#endif +#ifdef FFT_CUFFT + for (m = 0; m < nz_pppm; m++) { +#endif + mper = m - nz_pppm*(2*m/nz_pppm); + snz = sin(0.5*unitkz*mper*zprd_slab/nz_pppm); + snz2 = snz*snz; + +#ifndef FFT_CUFFT + for (l = nylo_fft; l <= nyhi_fft; l++) { +#endif +#ifdef FFT_CUFFT + for (l = 0; l < ny_pppm; l++) { +#endif + lper = l - 
ny_pppm*(2*l/ny_pppm); + sny = sin(0.5*unitky*lper*yprd/ny_pppm); + sny2 = sny*sny; + +#ifndef FFT_CUFFT + for (k = nxlo_fft; k <= nxhi_fft; k++) { +#endif +#ifdef FFT_CUFFT + for (k = 0; k < nx_pppm; k++) { +#endif + kper = k - nx_pppm*(2*k/nx_pppm); + snx = sin(0.5*unitkx*kper*xprd/nx_pppm); + snx2 = snx*snx; + + sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + + pow(unitkz*mper,2.0); + + if (sqk != 0.0) { + numerator = form*12.5663706/sqk; + denominator = gf_denom(snx2,sny2,snz2); + sum1 = 0.0; + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-.25*pow(qx/g_ewald,2.0)); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm; + if (argx != 0.0) wx = pow(sin(argx)/argx,order); + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,2.0); + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + }*/ + +#ifdef FFT_CUFFT + //cu_greensfn->upload(); + //cu_fkx->upload(); + //cu_fky->upload(); + //cu_fkz->upload(); + //cu_vg->upload(); + cu_vdx_brick->upload(); + cu_vdy_brick->upload(); + cu_vdz_brick->upload(); + +#endif + cu_rho_coeff->upload(); + cu_density_brick->memset_device(0); + pppm_device_init_setup(&cuda->shared_data,shiftone,delxinv,delyinv,delzinv,nlower,nupper); +} + +/* ---------------------------------------------------------------------- + compute the PPPMCuda long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMCuda::compute(int eflag, int vflag) 
+{ + +// printf("PPPMCuda::compute START\n"); + cuda_shared_atom* cu_atom = & cuda->shared_data.atom; + + int i; + timespec starttime; + timespec endtime; + timespec starttotal; + timespec endtotal; + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + // extend size of per-atom arrays if necessary + + if ((cu_atom->update_nmax)||(old_nmax==0)) { + memory->destroy(part2grid); + nmax = atom->nmax; + memory->create(part2grid,nmax,3,"pppm:part2grid"); + delete cu_part2grid; + delete adev_data_array; + adev_data_array=new dev_array[1]; + cu_part2grid = new cCudaData ((int*)part2grid,adev_data_array, nmax,3); + + pppm_device_update(&cuda->shared_data,cu_part2grid->dev_data(),atom->nlocal,atom->nmax); + old_nmax=nmax; + } + if(cu_atom->update_nlocal) {pppm_update_nlocal(cu_atom->nlocal);} + + energy = 0.0; + if (vflag) + { + for (i = 0; i < 6; i++) virial[i] = 0.0; + cu_virial->memset_device(0); + } + if(eflag) cu_energy->memset_device(0); + clock_gettime(CLOCK_REALTIME,&starttotal); + + // find grid points for all my particles + // map my particle charge onto my local 3d density grid + + + clock_gettime(CLOCK_REALTIME,&starttime); + + particle_map(); + + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.pppm_particle_map+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + //cu_part2grid->download(); + clock_gettime(CLOCK_REALTIME,&starttime); + make_rho(); + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.pppm_make_rho+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + int nprocs=comm->nprocs; + + clock_gettime(CLOCK_REALTIME,&starttime); + +if(nprocs>1) +{ + 
cu_density_brick->download(); + brick2fft(); +} +else +{ + #ifdef FFT_CUFFT + pppm_initfftdata(&cuda->shared_data,(PPPM_FLOAT*)cu_density_brick->dev_data(),(FFT_FLOAT*)cu_work2->dev_data()); + #endif +} + + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.pppm_brick2fft+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + + clock_gettime(CLOCK_REALTIME,&starttime); + poisson(eflag,vflag); + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.pppm_poisson+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + // all procs communicate E-field values to fill ghost cells + // surrounding their 3d bricks + + // not necessary since all the calculations are done on one proc + + // calculate the force on my particles + //cu_vdx_brick->download(); + //cu_vdy_brick->download(); + //cu_vdz_brick->download(); + + clock_gettime(CLOCK_REALTIME,&starttime); + fieldforce(); + clock_gettime(CLOCK_REALTIME,&endtime); + cuda->shared_data.cuda_timings.pppm_fieldforce+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + // sum energy across procs and add in volume-dependent term + + clock_gettime(CLOCK_REALTIME,&endtotal); + cuda->shared_data.cuda_timings.pppm_compute+=(endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000); + + if (eflag) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 0.5*volume; + energy -= g_ewald*qsqsum/1.772453851 + + 0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qqrd2e; + } + + // sum virial across procs + + if (vflag) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; 
i++) virial[i] = 0.5*qqrd2e*volume*virial_all[i]; + } + + // 2d slab correction + + if (slabflag) slabcorr(eflag); + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); + + if(firstpass) firstpass=false; +} + + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + + +void PPPMCuda::allocate() +{ + //printf("PPPMCuda::allocate START Mem: %i\n",CudaWrapper_CheckMemUseage()); +/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision\n"); + +#ifdef PPPM_PRECISION +if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for pppm core\n"); +if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for pppm core\n"); +#endif +#ifdef ENERGY_PRECISION +if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for energy\n"); +if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for energy\n"); +#endif +#ifdef ENERGY_PRECISION +if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for fft\n"); +if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for fft\n"); +#endif +#ifdef X_PRECISION +if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for positions\n"); +if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for positions\n"); +#endif +#ifdef F_PRECISION +if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision for forces\n"); +if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda: Using double precision for forces\n"); +#endif*/ + +//if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda: Using single precision\n"); + + struct dev_array* dev_tmp=new struct dev_array[20]; +int n_cudata=0; + + + 
memory->create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick"); + memory->create3d_offset(density_brick_int,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick_int"); + + + cu_density_brick = new cCudaData ((double*) &(density_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]), + (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1)); + + cu_density_brick_int = new cCudaData ((int*) &(density_brick_int[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]), + (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1)); + + memory->create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdx_brick"); + memory->create3d_offset(vdx_brick_tmp,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdx_brick_tmp"); + + cu_vdx_brick = new cCudaData ((double*) &(vdx_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]), + (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1)); + + memory->create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdy_brick"); + cu_vdy_brick = new cCudaData ((double*) &(vdy_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]), + (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1)); + + memory->create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdz_brick"); + cu_vdz_brick = new cCudaData ((double*) &(vdz_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]), + (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1)); + + memory->create(density_fft,nfft_both,"pppm:density_fft"); + + cu_density_fft = new cCudaData (density_fft, & (dev_tmp[n_cudata++]),nfft_both); + + cu_energy = new cCudaData (NULL, &(dev_tmp[n_cudata++]),ny_pppm*nz_pppm); + cu_virial = new cCudaData (NULL, &(dev_tmp[n_cudata++]),ny_pppm*nz_pppm*6); + + 
memory->create(greensfn,nfft_both,"pppm:greensfn"); + cu_greensfn = new cCudaData (greensfn, & (dev_tmp[n_cudata++]) , nx_pppm*ny_pppm*nz_pppm); + + memory->create(work1,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work1"); + memory->create(work2,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work2"); + memory->create(work3,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work3"); + + cu_work1 = new cCudaData (work1, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm); + cu_work2 = new cCudaData (work2, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm); + cu_work3 = new cCudaData (work3, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm); + + + memory->create(fkx,nx_pppm,"pppmcuda:fkx"); + cu_fkx = new cCudaData (fkx, & (dev_tmp[n_cudata++]) , nx_pppm); + memory->create(fky,ny_pppm,"pppmcuda:fky"); + cu_fky = new cCudaData (fky, & (dev_tmp[n_cudata++]) , ny_pppm); + memory->create(fkz,nz_pppm,"pppmcuda:fkz"); + cu_fkz = new cCudaData (fkz, & (dev_tmp[n_cudata++]) , nz_pppm); + + memory->create(vg,nfft_both,6,"pppm:vg"); + + cu_vg = new cCudaData ((double*)vg, & (dev_tmp[n_cudata++]) , nfft_both,6); + + memory->create(buf1,nbuf,"pppm:buf1"); + memory->create(buf2,nbuf,"pppm:buf2"); + + + // summation coeffs + + + gf_b = new double[order]; + cu_gf_b = new cCudaData (gf_b, &(dev_tmp[n_cudata++]) , order); + memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d"); + memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff"); + + cu_rho_coeff = new cCudaData ((double*) &(rho_coeff[0][(1-order)/2]), & (dev_tmp[n_cudata++]) , order*(order/2-(1-order)/2+1)); + + debugdata=new PPPM_FLOAT[100]; + cu_debugdata = new cCudaData (debugdata,& (dev_tmp[n_cudata++]),100); + cu_flag = new cCudaData (&global_flag,& (dev_tmp[n_cudata++]),3); + + // create 2 FFTs and a Remap + // 1st FFT keeps data in FFT decompostion + // 2nd FFT returns data in 3d brick decomposition + // remap takes data from 3d brick to FFT decomposition + + int tmp; + + + + + fft1c = new 
FFT3dCuda(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp,true); + + fft2c = new FFT3dCuda(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp,false); + + +#ifdef FFT_CUFFT + fft1c->set_cudata(cu_work2->dev_data(),cu_work1->dev_data()); + fft2c->set_cudata(cu_work2->dev_data(),cu_work3->dev_data()); +#endif + + remap = new Remap(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,2); + + +pppm_device_init(cu_density_brick->dev_data(), cu_vdx_brick->dev_data(), cu_vdy_brick->dev_data(), cu_vdz_brick->dev_data(), cu_density_fft->dev_data(),cu_energy->dev_data(),cu_virial->dev_data() + , cu_work1->dev_data(), cu_work2->dev_data(), cu_work3->dev_data(), cu_greensfn->dev_data(), cu_fkx->dev_data(), cu_fky->dev_data(), cu_fkz->dev_data(), cu_vg->dev_data() + ,nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,nx_pppm,ny_pppm,nz_pppm + ,nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,cu_gf_b->dev_data() + ,qqrd2e,order,cu_rho_coeff->dev_data(),cu_debugdata->dev_data(),cu_density_brick_int->dev_data(),slabflag + ); +} + + + +/* ---------------------------------------------------------------------- + deallocate memory that depends on # of K-vectors and order + ---------------------------------------------------------------------- */ + +void PPPMCuda::deallocate() +{ + memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); + + density_brick = vdx_brick = vdy_brick = vdz_brick = NULL; + + memory->destroy(density_fft); + 
memory->destroy(greensfn); + memory->destroy(work1); + memory->destroy(work2); + memory->destroy(vg); + + density_fft = NULL; + greensfn = NULL; + work1 = NULL; + work2 = NULL; + vg = NULL; + + memory->destroy(fkx); + memory->destroy(fky); + memory->destroy(fkz); + + fkx = NULL; + fky = NULL; + fkz = NULL; + + delete cu_density_brick; + delete cu_density_brick_int; + delete cu_vdx_brick; + delete cu_vdy_brick; + delete cu_vdz_brick; + delete cu_density_fft; + delete cu_energy; + delete cu_virial; +#ifdef FFT_CUFFT + delete cu_greensfn; + delete cu_gf_b; + delete cu_vg; + delete cu_work1; + delete cu_work2; + delete cu_work3; + delete cu_fkx; + delete cu_fky; + delete cu_fkz; +#endif + + delete cu_flag; + delete cu_debugdata; + delete cu_rho_coeff; + + + cu_vdx_brick = cu_vdy_brick = cu_vdz_brick = NULL; + cu_density_brick = NULL; + cu_density_brick_int = NULL; + cu_density_fft = NULL; + cu_energy=NULL; + cu_virial=NULL; +#ifdef FFT_CUFFT + cu_greensfn = NULL; + cu_gf_b = NULL; + cu_work1 = cu_work2 = cu_work3 = NULL; + cu_vg = NULL; + cu_fkx = cu_fky = cu_fkz = NULL; +#endif + + cu_flag = NULL; + cu_debugdata = NULL; + cu_rho_coeff = NULL; + cu_part2grid = NULL; + + memory->destroy(buf1); + memory->destroy(buf2); + + delete [] gf_b; + gf_b = NULL; + memory->destroy2d_offset(rho1d,-order/2); rho1d = NULL; + memory->destroy2d_offset(rho_coeff,(1-order)/2); rho_coeff = NULL; + + delete fft1c; + fft1c = NULL; + double end=CudaWrapper_CheckMemUseage()/1024/1024; + delete fft2c; + fft2c = NULL; + delete remap; + remap = NULL; + buf1 = NULL; + buf2 = NULL; +} + +/* ---------------------------------------------------------------------- + set size of FFT grid (nx,ny,nz_pppm) and g_ewald +-------------------------------------------------------------------------*/ + +void PPPMCuda::set_grid() +{ + // see JCP 109, pg 7698 for derivation of coefficients + // higher order coefficients may be computed if needed + + double **acons; + memory->create(acons,8,7,"pppm:acons"); + + 
acons[1][0] = 2.0 / 3.0; + acons[2][0] = 1.0 / 50.0; + acons[2][1] = 5.0 / 294.0; + acons[3][0] = 1.0 / 588.0; + acons[3][1] = 7.0 / 1440.0; + acons[3][2] = 21.0 / 3872.0; + acons[4][0] = 1.0 / 4320.0; + acons[4][1] = 3.0 / 1936.0; + acons[4][2] = 7601.0 / 2271360.0; + acons[4][3] = 143.0 / 28800.0; + acons[5][0] = 1.0 / 23232.0; + acons[5][1] = 7601.0 / 13628160.0; + acons[5][2] = 143.0 / 69120.0; + acons[5][3] = 517231.0 / 106536960.0; + acons[5][4] = 106640677.0 / 11737571328.0; + acons[6][0] = 691.0 / 68140800.0; + acons[6][1] = 13.0 / 57600.0; + acons[6][2] = 47021.0 / 35512320.0; + acons[6][3] = 9694607.0 / 2095994880.0; + acons[6][4] = 733191589.0 / 59609088000.0; + acons[6][5] = 326190917.0 / 11700633600.0; + acons[7][0] = 1.0 / 345600.0; + acons[7][1] = 3617.0 / 35512320.0; + acons[7][2] = 745739.0 / 838397952.0; + acons[7][3] = 56399353.0 / 12773376000.0; + acons[7][4] = 25091609.0 / 1560084480.0; + acons[7][5] = 1755948832039.0 / 36229939200000.0; + acons[7][6] = 4887769399.0 / 37838389248.0; + + double q2 = qsqsum / force->dielectric; + bigint natoms = atom->natoms; + + // use xprd,yprd,zprd even if triclinic so grid size is the same + // adjust z dimension for 2d slab PPPMCuda + // 3d PPPMCuda just uses zprd since slab_volfactor = 1.0 + + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + + // make initial g_ewald estimate + // based on desired error and real space cutoff + // fluid-occupied volume used to estimate real-space error + // zprd used rather than zprd_slab + + double hx,hy,hz; + + if (!gewaldflag) + g_ewald = sqrt(-log(precision*sqrt(natoms*cutoff*xprd*yprd*zprd) / + (2.0*q2))) / cutoff; + + // set optimal nx_pppm,ny_pppm,nz_pppm based on order and precision + // nz_pppm uses extended zprd_slab instead of zprd + // h = 1/g_ewald is upper bound on h such that h*g_ewald <= 1 + // reduce it until precision target is met + + if (!gridflag) { + double err; + hx = hy 
= hz = 1/g_ewald; + + nx_pppm = static_cast (xprd/hx + 1); + ny_pppm = static_cast (yprd/hy + 1); + nz_pppm = static_cast (zprd_slab/hz + 1); + + err = rms(hx,xprd,natoms,q2,acons); + while (err > precision) { + err = rms(hx,xprd,natoms,q2,acons); + nx_pppm++; + hx = xprd/nx_pppm; + } + + err = rms(hy,yprd,natoms,q2,acons); + while (err > precision) { + err = rms(hy,yprd,natoms,q2,acons); + ny_pppm++; + hy = yprd/ny_pppm; + } + + err = rms(hz,zprd_slab,natoms,q2,acons); + while (err > precision) { + err = rms(hz,zprd_slab,natoms,q2,acons); + nz_pppm++; + hz = zprd_slab/nz_pppm; + } + } + + // boost grid size until it is factorable + + while (!factorable(nx_pppm)) nx_pppm++; + while (!factorable(ny_pppm)) ny_pppm++; + while (!factorable(nz_pppm)) nz_pppm++; + + // if allowed try to change grid size until it is a power of a single prime factor + if(precisionmodify!='=') + { + if (me == 0) { + if (screen) { + fprintf(screen,"Initial grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + } + if (logfile) { + fprintf(logfile,"Initial grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + } + } + make_power_of_prime(&nx_pppm); + make_power_of_prime(&ny_pppm); + make_power_of_prime(&nz_pppm); + if (me == 0) { + if (screen) { + fprintf(screen,"Modified grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + } + if (logfile) { + fprintf(logfile,"Modified grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + } + } + } + + // adjust g_ewald for new grid size + + hx = xprd/nx_pppm; + hy = yprd/ny_pppm; + hz = zprd_slab/nz_pppm; + + if (!gewaldflag) { + double gew1,gew2,dgew,f,fmid,hmin,rtb; + int ncount; + + gew1 = 0.0; + g_ewald = gew1; + f = diffpr(hx,hy,hz,q2,acons); + + hmin = MIN(hx,MIN(hy,hz)); + gew2 = 10/hmin; + g_ewald = gew2; + fmid = diffpr(hx,hy,hz,q2,acons); + + if (f*fmid >= 0.0) error->all("Cannot compute PPPMCuda G"); + rtb = f < 0.0 ? 
(dgew=gew2-gew1,gew1) : (dgew=gew1-gew2,gew2); + ncount = 0; + while (fabs(dgew) > SMALL && fmid != 0.0) { + dgew *= 0.5; + g_ewald = rtb + dgew; + fmid = diffpr(hx,hy,hz,q2,acons); + if (fmid <= 0.0) rtb = g_ewald; + ncount++; + if (ncount > LARGE) error->all("Cannot compute PPPMCuda G"); + } + } + + // final RMS precision + + double lprx = rms(hx,xprd,natoms,q2,acons); + double lpry = rms(hy,yprd,natoms,q2,acons); + double lprz = rms(hz,zprd_slab,natoms,q2,acons); + double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + double spr = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd_slab); + + // free local memory + + memory->destroy(acons); + + // print info + + if (me == 0) { + if (screen) { + fprintf(screen," G vector = %g\n",g_ewald); + fprintf(screen," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + fprintf(screen," stencil order = %d\n",order); + fprintf(screen," RMS precision = %g\n",MAX(lpr,spr)); + } + if (logfile) { + fprintf(logfile," G vector = %g\n",g_ewald); + fprintf(logfile," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + fprintf(logfile," stencil order = %d\n",order); + fprintf(logfile," RMS precision = %g\n",MAX(lpr,spr)); + } + } +} + + +/* ---------------------------------------------------------------------- + check if all factors of n are prime + return 1 if yes, 0 if no +-------------------------------------------------------------------------*/ + +void PPPMCuda::make_power_of_prime(int* n) +{ + + if((precisionmodify!='+')&&(precisionmodify!='-')&&(precisionmodify!='c')) + {error->all("Unknown Option for PPPMCuda, assumeing '='");return;} + int oldn=*n; + int* primelist=new int[1000]; + int count=0; + + int prime=1; + while(prime<2000) primelist[count++]=prime*=2; + prime=1; + while(prime<2000) primelist[count++]=prime*=3; + prime=1; + while(prime<2000) primelist[count++]=prime*=5; + prime=1; + while(prime<2000) primelist[count++]=prime*=7; + + for(int i=0;iprimelist[j+1]) + { + int 
a=primelist[j+1]; + primelist[j+1]=primelist[j]; + primelist[j]=a; + } + } + + int nextsmaller=0; + while((primelist[nextsmaller+1]<*n)&&(nextsmaller+1*n)&&(nextlarger>0)) nextlarger--; + + if(precisionmodify=='-') + *n=primelist[nextsmaller]; + if((precisionmodify=='+')&& + (primelist[nextlarger]*primelist[nextlarger]*primelist[nextlarger]<2*(*n)*(*n)*(*n))) + *n=primelist[nextlarger]; + if(precisionmodify=='c') + { + double factorsmaller=1.0*(*n)*(*n)*(*n)/(primelist[nextsmaller]*primelist[nextsmaller]*primelist[nextsmaller]); + double factorlarger=1.0*(primelist[nextlarger]*primelist[nextlarger]*primelist[nextlarger])/((*n)*(*n)*(*n)); + if((factorlargermemset_device(0); + flag=cuda_particle_map(&cuda->shared_data,cu_flag->dev_data()); + if(flag) + { + cu_debugdata->download(); + printf("Out of range atom: "); + printf("ID: %i ",atom->tag[int(debugdata[0])]); + printf("x: %e ",debugdata[7]); + printf("y: %e ",debugdata[8]); + printf("z: %e ",debugdata[9]); + printf("nx: %e ",debugdata[4]); + printf("ny: %e ",debugdata[5]); + + printf("\n"); + //printf("debugdata: cpu: %e %e %e %i\n",boxlo[0],boxlo[1],boxlo[2],atom->nlocal); + cuda->cu_x->download(); + int nx,ny,nz; + + double **x = atom->x; + int nlocal = atom->nlocal; + for (int i = 0; i < nlocal; i++) { + nx = static_cast ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET; + ny = static_cast ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET; + nz = static_cast ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET; + + if(i==1203)printf("Outside Atom: %i %e %e %e (%i %i %i)\n",i,x[i][0],x[i][1],x[i][2],nx,ny,nz); + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out || i==1203) {printf("Outside Atom: %i %e %e %e (%i %i %i)\n",i,x[i][0],x[i][1],x[i][2],nx,ny,nz); } + } + + } + + int flag_all; + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + if (flag_all) error->all("Out of range atoms - cannot compute PPPMCuda!"); +} + +/* 
---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + + +void PPPMCuda::make_rho() +{ + cuda_make_rho(&cuda->shared_data,cu_flag->dev_data(),&density_intScale,nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,cu_density_brick->dev_data(),cu_density_brick_int->dev_data()); +} + + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver +------------------------------------------------------------------------- */ +void PPPMCuda::poisson(int eflag, int vflag) +{ + +#ifndef FFT_CUFFT + PPPM::poisson(eflag,vflag); + return; +#endif +#ifdef FFT_CUFFT + timespec starttime,starttime2; + timespec endtime,endtime2; + + int nprocs=comm->nprocs; + clock_gettime(CLOCK_REALTIME,&starttime); + fft1c->compute(density_fft,work1,1); + + clock_gettime(CLOCK_REALTIME,&endtime); + poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + + + if (eflag || vflag) { + poisson_energy(nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,vflag); + ENERGY_FLOAT gpuvirial[6]; + energy+=sum_energy(cu_virial->dev_data(),cu_energy->dev_data(),nx_pppm,ny_pppm,nz_pppm,vflag,gpuvirial); + if(vflag) + { + for(int j=0;j<6;j++) virial[j]+=gpuvirial[j]; + } + } + + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + poisson_scale(nx_pppm,ny_pppm,nz_pppm); + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + + poisson_xgrad(nx_pppm,ny_pppm,nz_pppm); + + + 
clock_gettime(CLOCK_REALTIME,&starttime); + fft2c->compute(work2,work2,-1); + clock_gettime(CLOCK_REALTIME,&endtime); + poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + poisson_vdx_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm); + + + // y direction gradient + + poisson_ygrad(nx_pppm,ny_pppm,nz_pppm); + + clock_gettime(CLOCK_REALTIME,&starttime); + fft2c->compute(work2,work2,-1); + clock_gettime(CLOCK_REALTIME,&endtime); + poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + poisson_vdy_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm); + + // z direction gradient + + poisson_zgrad(nx_pppm,ny_pppm,nz_pppm); + + clock_gettime(CLOCK_REALTIME,&starttime); + fft2c->compute(work2,work2,-1); + clock_gettime(CLOCK_REALTIME,&endtime); + poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); + + poisson_vdz_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm); + #endif +} + +/*---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles +-------------------------------------------------------------------------*/ + +void PPPMCuda::fieldforce() +{ + cuda_fieldforce(& cuda->shared_data,cu_flag); + return; +} + + + + +/* ---------------------------------------------------------------------- + perform and time the 4 FFTs required for N timesteps +------------------------------------------------------------------------- */ + +void PPPMCuda::timing(int n, double &time3d, double &time1d) +{ + + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = 0.0; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1c->compute(work1,work1,1); + fft2c->compute(work1,work1,-1); + fft2c->compute(work1,work1,-1); + 
fft2c->compute(work1,work1,-1); + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time3d = time2 - time1; + + MPI_Barrier(world); + /*time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1c->timing1d(work1,nfft_both,1); + fft2c->timing1d(work1,nfft_both,-1); + fft2c->timing1d(work1,nfft_both,-1); + fft2c->timing1d(work1,nfft_both,-1); + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time1d = time2 - time1;*/ + +} + +void PPPMCuda::slabcorr(int eflag) +{ + // compute local contribution to global dipole moment + if(slabbuf==NULL) + { + slabbuf=new ENERGY_FLOAT[(atom->nmax+31)/32]; + cu_slabbuf = new cCudaData (slabbuf, (atom->nmax+31)/32); + } + if((atom->nlocal+31)/32*sizeof(ENERGY_FLOAT)>=cu_slabbuf->dev_size()) + { + delete [] slabbuf; + delete cu_slabbuf; + slabbuf=new ENERGY_FLOAT[(atom->nmax+31)/32]; + cu_slabbuf = new cCudaData (slabbuf, (atom->nmax+31)/32); + } + + + double dipole = cuda_slabcorr_energy(&cuda->shared_data,slabbuf,(ENERGY_FLOAT*) cu_slabbuf->dev_data()); + + double dipole_all; + MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); + + // compute corrections + + double e_slabcorr = 2.0*PI*dipole_all*dipole_all/volume; + + if (eflag) energy += qqrd2e*scale * e_slabcorr; + + double ffact = -4.0*PI*dipole_all/volume; + + cuda_slabcorr_force(&cuda->shared_data,ffact); +} diff --git a/src/USER-CUDA/pppm_cuda.cu b/src/USER-CUDA/pppm_cuda.cu new file mode 100644 index 0000000000..cabea885d3 --- /dev/null +++ b/src/USER-CUDA/pppm_cuda.cu @@ -0,0 +1,579 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_precision.h" +//#define FFT_CUFFT +#define MY_PREFIX pppm +#include "cuda_shared.h" +#include "cuda_common.h" +#include "pppm_cuda_cu.h" +#include "cuda_runtime.h" +#include + +//#include "crm_cuda_utils.cu" +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + + __device__ __constant__ FFT_FLOAT* work1; + __device__ __constant__ FFT_FLOAT* work2; + __device__ __constant__ FFT_FLOAT* work3; + __device__ __constant__ PPPM_FLOAT* greensfn; + __device__ __constant__ PPPM_FLOAT* gf_b; + __device__ __constant__ PPPM_FLOAT* fkx; + __device__ __constant__ PPPM_FLOAT* fky; + __device__ __constant__ PPPM_FLOAT* fkz; + __device__ __constant__ PPPM_FLOAT* vg; + __device__ __constant__ int* part2grid; + __device__ __constant__ PPPM_FLOAT* density_brick; + __device__ __constant__ int* density_brick_int; + __device__ __constant__ PPPM_FLOAT density_intScale; + __device__ __constant__ PPPM_FLOAT* vdx_brick; + __device__ __constant__ PPPM_FLOAT* vdy_brick; + __device__ __constant__ PPPM_FLOAT* vdz_brick; + __device__ __constant__ PPPM_FLOAT* density_fft; + __device__ __constant__ ENERGY_FLOAT* energy; + __device__ __constant__ ENERGY_FLOAT* virial; + __device__ __constant__ int nxlo_in; + __device__ __constant__ int nxhi_in; + __device__ __constant__ int nxlo_out; + __device__ __constant__ int nxhi_out; + __device__ __constant__ int nylo_in; + __device__ __constant__ int nyhi_in; + __device__ 
__constant__ int nylo_out; + __device__ __constant__ int nyhi_out; + __device__ __constant__ int nzlo_in; + __device__ __constant__ int nzhi_in; + __device__ __constant__ int nzlo_out; + __device__ __constant__ int nzhi_out; + __device__ __constant__ int nxlo_fft; + __device__ __constant__ int nxhi_fft; + __device__ __constant__ int nylo_fft; + __device__ __constant__ int nyhi_fft; + __device__ __constant__ int nzlo_fft; + __device__ __constant__ int nzhi_fft; + __device__ __constant__ int nx_pppm; + __device__ __constant__ int ny_pppm; + __device__ __constant__ int nz_pppm; + __device__ __constant__ int slabflag; + __device__ __constant__ PPPM_FLOAT qqrd2e; + __device__ __constant__ int order; + //__device__ __constant__ float3 sublo; + __device__ __constant__ PPPM_FLOAT* rho_coeff; + __device__ __constant__ int nmax; + __device__ __constant__ int nlocal; + __device__ __constant__ PPPM_FLOAT* debugdata; + __device__ __constant__ PPPM_FLOAT delxinv; + __device__ __constant__ PPPM_FLOAT delyinv; + __device__ __constant__ PPPM_FLOAT delzinv; + __device__ __constant__ int nlower; + __device__ __constant__ int nupper; + __device__ __constant__ PPPM_FLOAT shiftone; + + +#include "pppm_cuda_kernel.cu" +#include "stdio.h" +void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial + ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + ,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm + ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b + ,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag + ) +{ + 
CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); + cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int)); + cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int)); + cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int)); + cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, sizeof(int)); + cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int)); + cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int)); + cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int)); + cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int)); + cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int)); + cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int)); + cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int)); + cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int)); + cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int)); + cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int)); + cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int)); + cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int)); + cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int)); + cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int)); + cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int)); + cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int)); + cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int)); + cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int)); + cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*)); + 
cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*)); + + PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e; + cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("order",&cu_order, sizeof(int)); + cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*)); + + CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); + +/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); + +#ifdef PPPM_PRECISION +if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); +if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); +#endif +#ifdef ENERGY_PRECISION +if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); +if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); +#endif +#ifdef ENERGY_PRECISION +if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); +if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); +#endif +#ifdef X_PRECISION +if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); +if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); +#endif +#ifdef F_PRECISION +if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); 
+if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); +#endif*/ +} + /* Uploads setup constants: copies the by-value arguments cu_delxinv, cu_delyinv, cu_delzinv, cu_shiftone, cu_nlower and cu_nupper to the device symbols delxinv, delyinv, delzinv, shiftone, nlower and nupper, then copies three X_FLOAT values each from sdata->domain.sublo, subhi and boxlo to MY_CONST(sublo), MY_CONST(subhi) and MY_CONST(boxlo), and finishes with a CUT_CHECK_ERROR check. NOTE(review): the symbols are addressed by string name, a form of cudaMemcpyToSymbol that later CUDA toolkits removed; confirm the toolkit version this is built against. */+void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT cu_shiftone,PPPM_FLOAT cu_delxinv,PPPM_FLOAT cu_delyinv,PPPM_FLOAT cu_delzinv,int cu_nlower,int cu_nupper) +{ + cudaMemcpyToSymbol("delxinv",&cu_delxinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("delyinv",&cu_delyinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("delzinv",&cu_delzinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("shiftone",&cu_shiftone, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol("nlower",&cu_nlower, sizeof(int)); + cudaMemcpyToSymbol("nupper",&cu_nupper, sizeof(int)); + cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo, 3*sizeof(X_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); +} + /* Publishes per-update pointers and counts to device symbols: cu_part2grid goes to part2grid, the dev_data pointers of sdata->atom.x, f, q and tag go to MY_CONST(x), MY_CONST(f), MY_CONST(q) and MY_CONST(tag), and nlocala and nmaxa go to nlocal and nmax. NOTE(review): q is sized with sizeof(F_FLOAT*) like f; only the pointer size matters for this copy, but confirm the element type of q. */+void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa) +{ + cudaMemcpyToSymbol("part2grid",&cu_part2grid, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); + cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); + cudaMemcpyToSymbol("nmax" , &nmaxa, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update"); + +} + /* Copies nlocala to the nlocal device symbol and checks for a CUDA error. */+void pppm_update_nlocal(int nlocala) +{ + cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); +} + + /* Fills a dim3 grid with (nz_pppma, ny_pppma, 1) and a dim3 threads with (nx_pppma, 1, 1), invokes the setup_fkxyz_vg kernel with unitkx, unitky, unitkz and g_ewald, then calls cudaThreadSynchronize and CUT_CHECK_ERROR. NOTE(review): the launch reads <<>> in this copy, so the launch configuration between the chevrons appears to have been lost in extraction; restore it from the original file. */+void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald) +{ + 
dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + setup_fkxyz_vg<<>>(unitkx,unitky,unitkz,g_ewald); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); +} + /* Fills grid with (nz_pppma, ny_pppma, 1) and threads with (nx_pppma, 1, 1), invokes the setup_greensfn kernel with unitkx, unitky, unitkz, g_ewald, nbx, nby, nbz, xprd, yprd and zprd_slab, then synchronizes and checks for errors. NOTE(review): the launch configuration is empty (<<>>) in this copy, presumably stripped in extraction. */+void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, +int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + setup_greensfn<<>>(unitkx,unitky,unitkz,g_ewald,nbx,nby,nbz,xprd,yprd, zprd_slab); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn "); +} + /* Fills grid with (nz_pppma, ny_pppma, 1) and threads with (nx_pppma, 1, 1), invokes poisson_scale_kernel with no arguments and checks for errors; there is no explicit cudaThreadSynchronize call in this function. NOTE(review): launch configuration is empty (<<>>) in this copy. */+void poisson_scale(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_scale_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_scale "); + +} + /* Same grid and threads values as poisson_scale; invokes poisson_xgrad_kernel with no arguments and checks for errors. */+void poisson_xgrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_xgrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad "); +} + /* Same grid and threads values as poisson_scale; invokes poisson_ygrad_kernel with no arguments and checks for errors. */+void poisson_ygrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_ygrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad "); +} + /* Same grid and threads values as poisson_scale; invokes poisson_zgrad_kernel with no arguments and checks for errors. */+void poisson_zgrad(int nx_pppma,int ny_pppma,int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x=nz_pppma; + grid.y=ny_pppma; + grid.z=1; + threads.x=nx_pppma; + threads.y=1; + threads.z=1; + poisson_zgrad_kernel<<>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad "); +} + /* Fills grid with (khi-klo+1, jhi-jlo+1, 1) and threads with (ihi-ilo+1, 1, 1), invokes poisson_vdx_brick_kernel with ilo, jlo and klo, checks for errors and then synchronizes. The three trailing int parameters are not read in the body. */+void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int 
nx_pppma,int ny_pppma,int nz_pppma) +{ + + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x); + poisson_vdx_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick "); + cudaThreadSynchronize(); +} + /* Fills grid with (khi-klo+1, jhi-jlo+1, 1) and threads with (ihi-ilo+1, 1, 1), invokes poisson_vdy_brick_kernel with ilo, jlo and klo, checks for errors and then synchronizes. nx_pppm, ny_pppm and nz_pppm are not read. NOTE(review): launch configuration is empty (<<>>) in this copy, presumably stripped in extraction. */+void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) +{ + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + poisson_vdy_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick "); + cudaThreadSynchronize(); +} + /* Same grid and threads values as poisson_vdy_brick; invokes poisson_vdz_brick_kernel with ilo, jlo and klo, checks for errors and then synchronizes. nx_pppm, ny_pppm and nz_pppm are not read. */+void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) +{ + dim3 grid; + dim3 threads; + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + grid.z=1; + threads.x=ihi-ilo+1; + threads.y=1; + threads.z=1; + poisson_vdz_brick_kernel<<>>(ilo,jlo,klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick "); + cudaThreadSynchronize(); +} + + /* Checks for a pending error, fills grid with (nzhi_fft-nzlo_fft+1, nyhi_fft-nylo_fft+1, 1) and threads with (nxhi_fft-nxlo_fft+1, 1, 1), invokes poisson_energy_kernel with nxlo_fft, nylo_fft, nzlo_fft and vflag, then synchronizes and checks again. */+void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag) +{ + //printf("VFLAG_GPU: %i\n",vflag); + CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start "); + dim3 grid; + dim3 threads; + grid.x=nzhi_fft-nzlo_fft+1; + grid.y=nyhi_fft-nylo_fft+1; + grid.z=1; + threads.x=nxhi_fft-nxlo_fft+1; + threads.y=1; + threads.z=1; + poisson_energy_kernel<<>>(nxlo_fft,nylo_fft,nzlo_fft,vflag); + + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); +} + /* Invokes sum_energy_kernel1 with grid.x=nz_pppma and threads.x=ny_pppma, then sum_energy_kernel2 with grid (1, 1, 1) and threads.x=nz_pppma, each followed by a synchronize and an error check. Afterwards copies one ENERGY_FLOAT from cu_energy into host_energy and, when vflag is nonzero, six ENERGY_FLOAT values from cu_virial into cpu_virial. Returns host_energy. nx_pppma is not read. */+ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial) +{ + ENERGY_FLOAT host_energy=0; + dim3 grid; + dim3 threads; + + grid.x=nz_pppma; + grid.y=1; + grid.z=1; + threads.x=ny_pppma; + threads.y=1; + threads.z=1; + sum_energy_kernel1<<>>(vflag); + cudaThreadSynchronize(); + 
CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 "); + + grid.x=1; + grid.y=1; + grid.z=1; + threads.x=nz_pppma; + threads.y=1; + threads.z=1; + sum_energy_kernel2<<>>(vflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 "); + + cudaMemcpy((void*) (&host_energy), cu_energy, sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + if(vflag) + cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy"); + + return host_energy; +} + /* Runs make_rho_kernel in a retry loop over blocks of 32 threads covering sdata->atom.nlocal (grid.x is nlocal rounded up to a multiple of 32, divided by 32), adjusting the value behind cu_density_intScale between passes, and finally invokes scale_rho_kernel with grid (khi-klo+1, jhi-jlo+1) and ihi-ilo+1 threads. sharedmemsize is computed from nupper, nlower and order but is not visibly used; NOTE(review): it was presumably part of the launch configuration, which reads <<>> in this copy and appears to have been stripped. */+void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int) +{ + CUT_CHECK_ERROR("cuda_make_rho begin"); + dim3 grid,threads; + int cpu_flag[3]; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + int sharedmemsize=(32+32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); + /* Each pass uploads the current scale, zeroes the three device flag words and both density bricks, runs the kernel and reads the flags back. A nonzero flag[0] halves the scale; flag[0] and flag[1] both zero doubles it; the loop ends once flag[0] is zero and flag[1] is nonzero. NOTE(review): the upload below uses sizeof(PPPM_FLOAT*) as the byte count although the source is a single PPPM_FLOAT value; confirm against the declared type of the density_intScale device symbol. */do + { + cpu_flag[0]=0; + cpu_flag[1]=0; + cpu_flag[2]=0; + cudaMemcpyToSymbol("density_intScale",cu_density_intScale,sizeof(PPPM_FLOAT*)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z"); + cudaMemset(flag,0,3*sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A"); + cudaMemset(cu_density_brick,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(PPPM_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B"); + cudaMemset(cu_density_brick_int,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C"); + make_rho_kernel<<>>((int*) flag,32/(sdata->pppm.nupper-sdata->pppm.nlower+1)); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA make_rho A"); + cudaMemcpy((void*) &cpu_flag, flag, 3*sizeof(int),cudaMemcpyDeviceToHost); + if(cpu_flag[0]!=0) {(*cu_density_intScale)/=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: 
%e\n",*cu_density_intScale);)} + if((cpu_flag[0]==0)&&(cpu_flag[1]==0)) {(*cu_density_intScale)*=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n",*cu_density_intScale);)} + /* if((*cu_density_intScale)>0xe0000000) + { + printf("Error Scaling\n"); + cpu_flag[0]=0; + cpu_flag[1]=1; + }*/ + CUT_CHECK_ERROR("ERROR-CUDA make_rho B"); + } while((cpu_flag[0]!=0)||(cpu_flag[1]==0)); + + + grid.x=khi-klo+1; + grid.y=jhi-jlo+1; + threads.x=ihi-ilo+1; + scale_rho_kernel<<>>(); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale"); +} + + /* Invokes particle_map_kernel with flag cast to int* (grid.x covers sdata->atom.nlocal in blocks of 32 threads), synchronizes, copies one int back from flag and returns it. The meaning of the returned flag value is not visible in this chunk. */+int cuda_particle_map(cuda_shared_data* sdata,void* flag) +{ + dim3 grid,threads; + int cpu_flag; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + CUT_CHECK_ERROR("ERROR-CUDA particla_map ..pre"); + particle_map_kernel<<>>((int*) flag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA particla_map a"); + cudaMemcpy((void*) &cpu_flag, flag, sizeof(int),cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("ERROR-CUDA particla_map b"); + return cpu_flag; +} + + /* Invokes fieldforce_kernel with nupper-nlower+1, 32 divided by that same span, and flag cast to int*, using blocks of 32 threads over sdata->atom.nlocal; then synchronizes and checks for errors. sharedmemsize is computed but not visibly used; NOTE(review): presumably it belonged to the stripped launch configuration (<<>>). NOTE(review): the quotient 32/(nupper-nlower+1) is presumably an integer division and would be zero for spans above 32; confirm the field types and valid range. */+void cuda_fieldforce(cuda_shared_data* sdata,void* flag) +{ + dim3 grid,threads; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + int sharedmemsize=(32+3*32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); + fieldforce_kernel<<>> + (sdata->pppm.nupper-sdata->pppm.nlower+1,32/(sdata->pppm.nupper-sdata->pppm.nlower+1),(int*) flag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA fieldforce"); +} + /* Invokes slabcorr_energy_kernel with dev_buf using blocks of 32 threads over sdata->atom.nlocal, synchronizes, copies grid.x ENERGY_FLOAT values from dev_buf into buf and starts a loop with a dipole_all accumulator. NOTE(review): the rest of this function, including its return statement, is truncated in this copy. */+double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf) +{ + dim3 grid,threads; + grid.x=(sdata->atom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + slabcorr_energy_kernel<<>>(dev_buf); + cudaThreadSynchronize(); + cudaMemcpy((void*) buf, dev_buf, 
grid.x*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); + + double dipole_all=0.0; + /* NOTE(review): the text is truncated here in this copy. The loop condition runs straight into the grid.x assignment of a following launcher that invokes slabcorr_force_kernel with ffact, so the loop body, the return of the energy function and the header of that launcher are missing; restore them from the original file. */for(int i=0;iatom.nlocal+31)/32; + grid.y=1; + grid.z=1; + threads.x=32; + threads.y=1; + threads.z=1; + slabcorr_force_kernel<<>>(ffact); + cudaThreadSynchronize(); +} + /* Empty stub: accepts host_virial and does nothing. */+void sum_virial(double* host_virial) +{ +} + /* Invokes eight initfftdata kernels (core, z, y, x, yz, xz, xy, xyz) in sequence on in and out, with a cudaThreadSynchronize after each and one error check at the end. nslow, nmid and nfast are the hi_in minus lo_in differences for z, y and x taken from sdata->pppm; nrimz, nrimy and nrimx are the larger of lo_in minus lo_out and hi_out minus hi_in per axis. For each kernel grid.x, grid.y and threads.x are set to either the extent plus one or the rim width of the matching axis. NOTE(review): a rim width of zero would give a zero-sized grid or block for the rim kernels; confirm that callers guarantee nonzero rims or that the resulting launch error is acceptable. NOTE(review): launch configurations read <<>> in this copy, presumably stripped in extraction. */+void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out) +{ + int nslow=sdata->pppm.nzhi_in-sdata->pppm.nzlo_in; + int nmid=sdata->pppm.nyhi_in-sdata->pppm.nylo_in; + int nfast=sdata->pppm.nxhi_in-sdata->pppm.nxlo_in; + int nrimz=MAX(sdata->pppm.nzlo_in-sdata->pppm.nzlo_out,sdata->pppm.nzhi_out-sdata->pppm.nzhi_in); + int nrimy=MAX(sdata->pppm.nylo_in-sdata->pppm.nylo_out,sdata->pppm.nyhi_out-sdata->pppm.nyhi_in); + int nrimx=MAX(sdata->pppm.nxlo_in-sdata->pppm.nxlo_out,sdata->pppm.nxhi_out-sdata->pppm.nxhi_in); + dim3 grid; + grid.x=nslow+1; + grid.y=nmid+1; + grid.z=1; + dim3 threads; + threads.x=nfast+1; + threads.y=1; + threads.z=1; + cudaThreadSynchronize(); + initfftdata_core_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nmid+1; + threads.x=nfast+1; + initfftdata_z_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nrimy; + threads.x=nfast+1; + initfftdata_y_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nmid+1; + threads.x=nrimx; + initfftdata_x_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nrimy; + threads.x=nfast+1; + initfftdata_yz_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nmid+1; + threads.x=nrimx; + initfftdata_xz_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nslow+1; + grid.y=nrimy; + threads.x=nrimx; + initfftdata_xy_kernel<<>>(in,out); + cudaThreadSynchronize(); + grid.x=nrimz; + grid.y=nrimy; + threads.x=nrimx; + initfftdata_xyz_kernel<<>>(in,out); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel"); +} + + diff --git a/src/USER-CUDA/pppm_cuda.h 
b/src/USER-CUDA/pppm_cuda.h new file mode 100644 index 0000000000..0becd762c0 --- /dev/null +++ b/src/USER-CUDA/pppm_cuda.h @@ -0,0 +1,114 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + /* When KSPACE_CLASS is defined this header contributes only the KSpaceStyle registration line that ties the style name pppm/cuda to PPPMCuda; otherwise it provides the include-guarded class declaration. */+#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/cuda,PPPMCuda) + +#else + +#ifndef LMP_PPPM_CUDA_H +#define LMP_PPPM_CUDA_H + +#include "pppm.h" +#include "cuda_data.h" +#include "cuda_precision.h" + +namespace LAMMPS_NS { + /* PPPM subclass registered as the pppm/cuda kspace style. Publicly declares a constructor, destructor, init, setup, compute and timing, plus a public poissontime member; the remaining members are protected. */+class PPPMCuda : public PPPM { + public: + PPPMCuda(class LAMMPS *, int, char **); + ~PPPMCuda(); + void init(); + void setup(); + void compute(int, int); + void timing(int, double &, double &); + + double poissontime; + protected: + /* Pointer to a Cuda object and two FFT3dCuda objects. */class Cuda *cuda; + class FFT3dCuda *fft1c,*fft2c; + double* work3; + + /* NOTE(review): the cCudaData pointers below presumably wrap device-side copies of the like-named arrays; confirm in the implementation file. */cCudaData* cu_work1; + cCudaData* cu_work2; + cCudaData* cu_work3; + cCudaData* cu_greensfn; + cCudaData* cu_gf_b; + cCudaData* cu_fkx; + cCudaData* cu_fky; + cCudaData* cu_fkz; + cCudaData* cu_vg; + cCudaData* cu_density_brick; + cCudaData* cu_density_brick_int; + cCudaData* cu_vdx_brick; + cCudaData* cu_vdy_brick; + cCudaData* cu_vdz_brick; + cCudaData* cu_density_fft; + cCudaData* cu_energy; + cCudaData* cu_virial; + cCudaData* cu_x; + cCudaData* cu_v; + 
cCudaData* cu_f; + cCudaData* cu_q; + cCudaData* cu_part2grid; + cCudaData* cu_rho_coeff; + cCudaData* cu_debugdata; + cCudaData* cu_flag; + cCudaData* cu_pppm_grid_n; + cCudaData* cu_pppm_grid_ids; + + /* NOTE(review): slabbuf and cu_slabbuf are presumably the host buffer and its device wrapper used by the slab correction; confirm in the implementation file. */ENERGY_FLOAT* slabbuf; + cCudaData* cu_slabbuf; + + int*** density_brick_int; + PPPM_FLOAT density_intScale; + int pppm_grid_nmax; + int* pppm2partgrid; + int* pppm_grid; + PPPM_FLOAT* debugdata; + bool firstpass; + + /* Non-virtual helpers declared by this class. */void set_grid(); + void make_power_of_prime(int* n); + void allocate(); + void deallocate(); + + /* particle_map, make_rho, fieldforce and slabcorr are declared virtual; poisson(int, int) is not. NOTE(review): these presumably replace the like-named PPPM base-class stages; confirm against pppm.h. */virtual void particle_map(); + virtual void make_rho(); + void poisson(int, int); + virtual void fieldforce(); + virtual void slabcorr(int); + double*** vdx_brick_tmp; + int old_nmax; + int global_flag; + dev_array* adev_data_array; + char precisionmodify; + +}; + +} + +#endif +#endif diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp index 259ae8815d..0183368407 100644 --- a/src/USER-CUDA/verlet_cuda.cpp +++ b/src/USER-CUDA/verlet_cuda.cpp @@ -61,6 +61,8 @@ using namespace LAMMPS_NS; VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) { cuda = lmp->cuda; + /* Report an error through error->all when the LAMMPS instance carries no Cuda object (lmp->cuda is NULL), before modify is cast to ModifyCuda. */if(cuda == NULL) + error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'."); modify_cuda=(ModifyCuda*) modify; }