diff --git a/lib/gpu/README b/lib/gpu/README index 73a51fc391..b6358170f8 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -83,7 +83,11 @@ NOTE: PPPM acceleration can only be run on GPUs with compute capability>=1.1. when attempting to run PPPM on a GPU with compute capability 1.0. NOTE: Double precision is only supported on certain GPUs (with - compute capability>=1.3). + compute capability>=1.3). If you compile the GPU library for + a GPU with compute capability 1.1 and 1.2, then only single + precistion FFTs are supported, i.e. LAMMPS has to be compiled + with -DFFT_SINGLE. For details on configuring FFT support in + LAMMPS, see http://lammps.sandia.gov/doc/Section_start.html#2_2_4 NOTE: For Tesla and other graphics cards with compute capability>=1.3, make sure that -arch=sm_13 is set on the CUDA_ARCH line. @@ -96,9 +100,8 @@ NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be installed if the USER-CG-CMM package has been installed. -NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, pppm/gpu/single, and - pppm/gpu/double styles will only be installed if the KSPACE package has - been installed. +NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, and pppm/gpu styles + will only be installed if the KSPACE package has been installed. NOTE: The lj/charmm/coul/long will only be installed if the MOLECULE package has been installed. diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index 39ae01cb78..e098efb7f3 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -200,10 +200,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { + } else forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } force = (force_lj + forcecoul) * r2inv; @@ -212,7 +210,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].y) { energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- lj3[mtype].w; @@ -394,10 +393,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { + } else forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } force = (force_lj + forcecoul) * r2inv; @@ -406,7 +403,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].y) { energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- lj3[mtype].w; diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu index f2ba74c8b6..7722da9746 100644 --- a/lib/gpu/crml_gpu_kernel.cu +++ b/lib/gpu/crml_gpu_kernel.cu @@ -204,10 +204,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { + } else forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } force = (force_lj + forcecoul) * r2inv; @@ -216,7 +214,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); if (rsq < cut_ljsq) { numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w); if (rsq > cut_lj_innersq) @@ -407,10 +406,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { + } else forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } force = (force_lj + forcecoul) * r2inv; @@ -419,7 +416,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); if (rsq < cut_ljsq) { numtyp e=lj3-lj4; if (rsq > cut_lj_innersq) diff --git a/lib/gpu/lj_class2_long.cu b/lib/gpu/lj_class2_long.cu index 5fc7c408df..6bac225fa1 100644 --- a/lib/gpu/lj_class2_long.cu +++ b/lib/gpu/lj_class2_long.cu @@ -191,10 +191,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { + } else forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } force = (force_lj + forcecoul) * r2inv; @@ -203,7 +201,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); @@ -379,10 +378,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { + } else forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } force = (force_lj + forcecoul) * r2inv; @@ -391,7 +388,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index e177f1a0fd..638304247f 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -191,10 +191,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { + } else forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } force = (force_lj + forcecoul) * r2inv; @@ -203,7 +201,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); @@ -377,10 +376,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { + } else forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } force = (force_lj + forcecoul) * r2inv; @@ -389,7 +386,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu index 4b50fbd930..93780f6980 100644 --- a/lib/gpu/pppm_gpu_kernel.cu +++ b/lib/gpu/pppm_gpu_kernel.cu @@ -144,7 +144,7 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_, int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; int old=atom_add(counts+i, 1); - if (old==max_atoms) { + if (old>=max_atoms) { *error=2; atom_add(counts+i, -1); } else diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp index 2f7b35d051..43e9f8c753 100644 --- a/lib/gpu/pppm_gpu_memory.cpp +++ b/lib/gpu/pppm_gpu_memory.cpp @@ -51,7 +51,7 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, - const int nzhi_out, double **rho_coeff, + const int nzhi_out, grdtyp **rho_coeff, grdtyp **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, int &flag) { @@ -123,7 +123,7 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen, int numel=order*( order/2 - n2lo + 1 ); success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)== UCL_SUCCESS); - UCL_H_Vec view; + UCL_H_Vec view; view.view(rho_coeff[0]+n2lo,numel,*ucl_device); ucl_copy(d_rho_coeff,view,true); _max_bytes+=d_rho_coeff.row_bytes(); diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h index a6e57535bb..870c8aeb92 100644 --- a/lib/gpu/pppm_gpu_memory.h +++ b/lib/gpu/pppm_gpu_memory.h @@ -46,7 +46,7 @@ class PPPMGPUMemory { grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, - double **rho_coeff, grdtyp **vd_brick, + grdtyp **rho_coeff, grdtyp **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, int &success); diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp index 200d2f1685..c48462abf7 100644 --- a/lib/gpu/pppm_l_gpu.cpp +++ b/lib/gpu/pppm_l_gpu.cpp @@ -34,7 +34,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, - const int nzhi_out, double **rho_coeff, + const int nzhi_out, grdtyp **rho_coeff, grdtyp **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, int &success) { @@ -95,7 +95,7 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, - const int nzhi_out, double **rho_coeff, + const int nzhi_out, float **rho_coeff, float **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, int &success) {