Misc Improvements to GPU Package

- Optimizations for molecular systems
  - Improved kernel performance and greater CPU overlap
- Reduced GPU to CPU communications for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
W. Michael Brown
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions

View File

@ -319,7 +319,7 @@ CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR, THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD, BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES, BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
PPPM_MAX_SPLINE. PPPM_MAX_SPLINE, NBOR_PREFETCH.
CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
(NVIDIA) or OpenCL extensions (Intel) should be used for horizontal (NVIDIA) or OpenCL extensions (Intel) should be used for horizontal

View File

@ -12,13 +12,12 @@ EXTRAMAKE = Makefile.lammps.opencl
LMP_INC = -DLAMMPS_SMALLBIG LMP_INC = -DLAMMPS_SMALLBIG
OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/ OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -fp-model fast=2 -no-prec-div \ CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
-qoverride-limits OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
$(LMP_INC) $(OCL_INC) $(CPP_OPT) $(LMP_INC) $(OCL_INC) $(CPP_OPT)
OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
OCL_PREC = -D_SINGLE_DOUBLE OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DGERYON_NO_OCL_MARKERS
BIN_DIR = ./ BIN_DIR = ./
OBJ_DIR = ./ OBJ_DIR = ./

View File

@ -0,0 +1,28 @@
# /* ----------------------------------------------------------------------
# Linux Makefile for Intel oneAPI - Mixed precision (with timing enabled)
# ------------------------------------------------------------------------- */
# NOTE(review): -DGERYON_NO_OCL_MARKERS is not defined in OCL_TUNE below;
# presumably that is what keeps timing enabled in this variant - confirm.
# which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.opencl
# this setting should match LAMMPS Makefile
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
LMP_INC = -DLAMMPS_SMALLBIG
# header search path inside the oneAPI installation rooted at $(ONEAPI_ROOT)
OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
# host optimization flags: -O2, OpenMP (threads and SIMD) and relaxed
# floating-point math (-ffast-math -freciprocal-math)
CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
# compile command: the mpiicpc wrapper with -cxx=icpx naming the underlying
# C++ compiler, C++11 mode, plus the include and optimization settings above
OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
$(LMP_INC) $(OCL_INC) $(CPP_OPT)
# link against libOpenCL from the oneAPI compiler library directory
OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
# precision define; presumably selects the mixed-precision build named in the
# header - confirm against the precision options handled by the library
OCL_PREC = -D_SINGLE_DOUBLE
# extra preprocessor defines passed to the build for the Geryon layer
OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
# output directories (all set to the current directory)
BIN_DIR = ./
OBJ_DIR = ./
LIB_DIR = ./
# archiver and shell programs
AR = ar
BSH = /bin/sh
# pull in Opencl.makefile; presumably it holds the build rules that consume
# the variables set above - confirm
include Opencl.makefile

View File

@ -266,6 +266,7 @@ LAL_SERIALIZE_INIT Force serialization of initialization and compilation
for multiple MPI tasks sharing the same accelerator. for multiple MPI tasks sharing the same accelerator.
Some accelerator API implementations have had issues Some accelerator API implementations have had issues
with temporary file conflicts in the past. with temporary file conflicts in the past.
LAL_DISABLE_PREFETCH Disable prefetch in kernels
GERYON_FORCE_SHARED_MAIN_MEM_ON Should only be used for builds where the GERYON_FORCE_SHARED_MAIN_MEM_ON Should only be used for builds where the
accelerator is guaranteed to share physical accelerator is guaranteed to share physical
main memory with the host (e.g. integrated main memory with the host (e.g. integrated

View File

@ -429,7 +429,7 @@ void UCL_Device::clear() {
CU_SAFE_CALL_NS(cuCtxSetCurrent(_old_context)); CU_SAFE_CALL_NS(cuCtxSetCurrent(_old_context));
CU_SAFE_CALL_NS(cuDevicePrimaryCtxRelease(_cu_device)); CU_SAFE_CALL_NS(cuDevicePrimaryCtxRelease(_cu_device));
#else #else
cuCtxDestroy(_context)); cuCtxDestroy(_context);
#endif #endif
} }
_device=-1; _device=-1;

View File

@ -113,7 +113,7 @@ _texture( q_tex,int2);
dufld[5]=red_acc[5][tid]; \ dufld[5]=red_acc[5][tid]; \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 t; \ acctyp3 t; \
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \ t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \ (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \ t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
@ -147,7 +147,7 @@ _texture( q_tex,int2);
_fieldp[5]=red_acc[5][tid]; \ _fieldp[5]=red_acc[5][tid]; \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 f, fp; \ acctyp3 f, fp; \
f.x = _fieldp[0]; \ f.x = _fieldp[0]; \
f.y = _fieldp[1]; \ f.y = _fieldp[1]; \
f.z = _fieldp[2]; \ f.z = _fieldp[2]; \
@ -174,7 +174,7 @@ _texture( q_tex,int2);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -254,7 +254,7 @@ _texture( q_tex,int2);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 t; \ acctyp3 t; \
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \ t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \ (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \ t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
@ -277,7 +277,7 @@ _texture( q_tex,int2);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 f, fp; \ acctyp3 f, fp; \
f.x = _fieldp[0]; \ f.x = _fieldp[0]; \
f.y = _fieldp[1]; \ f.y = _fieldp[1]; \
f.z = _fieldp[2]; \ f.z = _fieldp[2]; \
@ -302,7 +302,7 @@ _texture( q_tex,int2);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -391,7 +391,7 @@ _texture( q_tex,int2);
if (t_per_atom>1) \ if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -416,9 +416,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global acctyp4 *restrict tep, __global acctyp3 *restrict tep,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int nall, const int nbor_pitch,
const int t_per_atom, const numtyp aewald, const int t_per_atom, const numtyp aewald,
@ -431,7 +431,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_charge(); local_allocate_store_charge();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -440,9 +440,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
for (int l=0; l<6; l++) virial[l]=(acctyp)0; for (int l=0; l<6; l++) virial[l]=(acctyp)0;
} }
acctyp4 tq; acctyp3 tq;
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
const __global numtyp4* polar1 = &extra[0]; const __global numtyp4* polar1 = &extra[0];
const __global numtyp4* polar2 = &extra[nall]; const __global numtyp4* polar2 = &extra[nall];
const __global numtyp4* polar3 = &extra[2*nall]; const __global numtyp4* polar3 = &extra[2*nall];
@ -695,7 +695,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict fieldp, __global acctyp3 *restrict fieldp,
const int inum, const int nall, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp off2, const numtyp aewald, const numtyp off2,
@ -889,7 +889,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict fieldp, __global acctyp3 *restrict fieldp,
const int inum, const int nall, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp off2, const numtyp aewald, const numtyp off2,
@ -1052,9 +1052,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global acctyp4 *restrict tep, __global acctyp3 *restrict tep,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int t_per_atom, const int nall, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp felec, const numtyp aewald, const numtyp felec,
@ -1067,7 +1067,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_charge(); local_allocate_store_charge();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -1082,7 +1082,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
for (int l=0; l<6; l++) dufld[l]=(acctyp)0; for (int l=0; l<6; l++) dufld[l]=(acctyp)0;
numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
const __global numtyp4* polar1 = &extra[0]; const __global numtyp4* polar1 = &extra[0];
const __global numtyp4* polar2 = &extra[nall]; const __global numtyp4* polar2 = &extra[nall];
const __global numtyp4* polar3 = &extra[2*nall]; const __global numtyp4* polar3 = &extra[2*nall];
@ -1226,7 +1226,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
numtyp prc3[3],prc5[3],prc7[3]; numtyp prc3[3],prc5[3],prc7[3];
numtyp drc3[3],drc5[3],drc7[3]; numtyp drc3[3],drc5[3],drc7[3];
numtyp urc3[3],urc5[3]; numtyp urc3[3],urc5[3];
numtyp ralpha = aewald * r; numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[5]; numtyp bn[5];
@ -1583,12 +1583,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
if (ii<inum) { if (ii<inum) {
const int nlpts = (bsorder-1) / 2; const int nlpts = (bsorder-1) / 2;
int istart = fast_mul(ii,4); int istart = fast_mul(ii,4);
const int igridx = igrid[istart]; const int igridx = igrid[istart];
const int igridy = igrid[istart+1]; const int igridy = igrid[istart+1];
const int igridz = igrid[istart+2]; const int igridz = igrid[istart+2];
// now istart is used to index thetai1, thetai2 and thetai3 // now istart is used to index thetai1, thetai2 and thetai3
istart = fast_mul(ii,bsorder); istart = fast_mul(ii,bsorder);
@ -1782,7 +1782,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
fdip_buf[7] = tuv110_1; fdip_buf[7] = tuv110_1;
fdip_buf[8] = tuv101_1; fdip_buf[8] = tuv101_1;
fdip_buf[9] = tuv011_1; fdip_buf[9] = tuv011_1;
idx = ii; idx = ii;
for (int m = 0; m < 10; m++) { for (int m = 0; m < 10; m++) {
fdip_phi1[idx] = fdip_buf[m]; fdip_phi1[idx] = fdip_buf[m];
idx += inum; idx += inum;
@ -1798,7 +1798,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
fdip_buf[7] = tuv110_2; fdip_buf[7] = tuv110_2;
fdip_buf[8] = tuv101_2; fdip_buf[8] = tuv101_2;
fdip_buf[9] = tuv011_2; fdip_buf[9] = tuv011_2;
idx = ii; idx = ii;
for (int m = 0; m < 10; m++) { for (int m = 0; m < 10; m++) {
fdip_phi2[idx] = fdip_buf[m]; fdip_phi2[idx] = fdip_buf[m];
idx += inum; idx += inum;
@ -1824,7 +1824,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
fdip_buf[17] = tuv102; fdip_buf[17] = tuv102;
fdip_buf[18] = tuv012; fdip_buf[18] = tuv012;
fdip_buf[19] = tuv111; fdip_buf[19] = tuv111;
idx = ii; idx = ii;
for (int m = 0; m < 20; m++) { for (int m = 0; m < 20; m++) {
fdip_sum_phi[idx] = fdip_buf[m]; fdip_sum_phi[idx] = fdip_buf[m];
idx += inum; idx += inum;
@ -1855,12 +1855,12 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
if (ii<inum) { if (ii<inum) {
int nlpts = (bsorder-1) / 2; int nlpts = (bsorder-1) / 2;
int istart = fast_mul(ii,4); int istart = fast_mul(ii,4);
int igridx = igrid[istart]; int igridx = igrid[istart];
int igridy = igrid[istart+1]; int igridy = igrid[istart+1];
int igridz = igrid[istart+2]; int igridz = igrid[istart+2];
// now istart is used to index thetai1, thetai2 and thetai3 // now istart is used to index thetai1, thetai2 and thetai3
istart = fast_mul(ii,bsorder); istart = fast_mul(ii,bsorder);
@ -1990,7 +1990,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
buf[18] = tuv012; buf[18] = tuv012;
buf[19] = tuv111; buf[19] = tuv111;
int idx = ii; int idx = ii;
for (int m = 0; m < 20; m++) { for (int m = 0; m < 20; m++) {
fphi[idx] = felec * buf[m]; fphi[idx] = felec * buf[m];
idx += inum; idx += inum;

View File

@ -28,9 +28,9 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int AnswerT::bytes_per_atom() const { int AnswerT::bytes_per_atom() const {
int bytes=11*sizeof(acctyp); int bytes=10*sizeof(acctyp);
if (_rot) if (_rot)
bytes+=4*sizeof(acctyp); bytes+=3*sizeof(acctyp);
if (_charge) if (_charge)
bytes+=sizeof(acctyp); bytes+=sizeof(acctyp);
return bytes; return bytes;
@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) {
bool success=true; bool success=true;
_ans_fields=4; _ans_fields=3;
if (_rot) if (_rot)
_ans_fields+=4; _ans_fields+=3;
// --------------------------- Device allocations // --------------------------- Device allocations
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
@ -134,11 +134,11 @@ void AnswerT::clear() {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
double AnswerT::host_memory_usage() const { double AnswerT::host_memory_usage() const {
int atom_bytes=4; int atom_bytes=3;
if (_charge) if (_charge)
atom_bytes+=1; atom_bytes+=1;
if (_rot) if (_rot)
atom_bytes+=4; atom_bytes+=3;
int ans_bytes=atom_bytes+_ev_fields; int ans_bytes=atom_bytes+_ev_fields;
return ans_bytes*(_max_local)*sizeof(acctyp)+ return ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(Answer<numtyp,acctyp>); sizeof(Answer<numtyp,acctyp>);
@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
if (csize>0) if (csize>0)
engv.update_host(_ev_stride*csize,true); engv.update_host(_ev_stride*csize,true);
if (_rot) if (_rot)
force.update_host(_inum*4*2,true); force.update_host(_inum*3*2,true);
else else
force.update_host(_inum*4,true); force.update_host(_inum*3,true);
time_answer.stop(); time_answer.stop();
#ifndef GERYON_OCL_FLUSH #ifndef GERYON_OCL_FLUSH
@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AnswerT::get_answers(double **f, double **tor) { void AnswerT::get_answers(double **f, double **tor) {
if (_ilist==nullptr) { if (_ilist==nullptr) {
typedef struct { double x,y,z; } vec3d; auto fp=reinterpret_cast<double*>(&(f[0][0]));
typedef struct { acctyp x,y,z,w; } vec4d_t;
auto fp=reinterpret_cast<vec3d*>(&(f[0][0]));
auto forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
#if (LAL_USE_OMP == 1) #if (LAL_USE_OMP == 1)
#pragma omp parallel #pragma omp parallel
@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) {
#if (LAL_USE_OMP == 1) #if (LAL_USE_OMP == 1)
const int nthreads = omp_get_num_threads(); const int nthreads = omp_get_num_threads();
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
const int idelta = _inum / nthreads + 1; const int idelta = _inum*3 / nthreads + 1;
const int ifrom = tid * idelta; const int ifrom = tid * idelta;
const int ito = std::min(ifrom + idelta, _inum); const int ito = std::min(ifrom + idelta, _inum*3);
#else #else
const int ifrom = 0; const int ifrom = 0;
const int ito = _inum; const int ito = _inum*3;
#endif #endif
for (int i=ifrom; i<ito; i++) { for (int i=ifrom; i<ito; i++)
fp[i].x+=forcep[i].x; fp[i]+=force[i];
fp[i].y+=forcep[i].y;
fp[i].z+=forcep[i].z;
}
if (_rot) { if (_rot) {
auto torp=reinterpret_cast<vec3d*>(&(tor[0][0])); auto torp=reinterpret_cast<double*>(&(tor[0][0]));
auto torquep=reinterpret_cast<vec4d_t*>(&(force[_inum*4])); auto torquep=&(force[_inum*3]);
for (int i=ifrom; i<ito; i++) { for (int i=ifrom; i<ito; i++)
torp[i].x+=torquep[i].x; torp[i]+=torquep[i];
torp[i].y+=torquep[i].y;
torp[i].z+=torquep[i].z;
}
} }
} }
} else { } else {
@ -344,7 +335,7 @@ void AnswerT::get_answers(double **f, double **tor) {
const int idelta = _inum / nthreads + 1; const int idelta = _inum / nthreads + 1;
const int ifrom = tid * idelta; const int ifrom = tid * idelta;
const int ito = std::min(ifrom + idelta, _inum); const int ito = std::min(ifrom + idelta, _inum);
int fl=ifrom*4; int fl=ifrom*3;
#else #else
const int ifrom = 0; const int ifrom = 0;
const int ito = _inum; const int ito = _inum;
@ -356,16 +347,16 @@ void AnswerT::get_answers(double **f, double **tor) {
f[ii][0]+=force[fl]; f[ii][0]+=force[fl];
f[ii][1]+=force[fl+1]; f[ii][1]+=force[fl+1];
f[ii][2]+=force[fl+2]; f[ii][2]+=force[fl+2];
fl+=4; fl+=3;
} }
if (_rot) { if (_rot) {
fl=_inum*4 + ifrom*4; fl=_inum*3 + ifrom*3;
for (int i=ifrom; i<ito; i++) { for (int i=ifrom; i<ito; i++) {
int ii=_ilist[i]; int ii=_ilist[i];
tor[ii][0]+=force[fl]; tor[ii][0]+=force[fl];
tor[ii][1]+=force[fl+1]; tor[ii][1]+=force[fl+1];
tor[ii][2]+=force[fl+2]; tor[ii][2]+=force[fl+2];
fl+=4; fl+=3;
} }
} }
} }

View File

@ -114,7 +114,7 @@ bool AtomT::alloc(const int nall) {
UCL_READ_ONLY)==UCL_SUCCESS); UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=q.device.row_bytes(); gpu_bytes+=q.device.row_bytes();
} }
if (_rot && !_host_view) { if (_rot) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS); UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes(); gpu_bytes+=quat.device.row_bytes();
@ -182,11 +182,9 @@ bool AtomT::add_fields(const bool charge, const bool rot,
if (rot && !_rot) { if (rot && !_rot) {
_rot=true; _rot=true;
_other=true; _other=true;
if (!_host_view) { success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS);
UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=quat.device.row_bytes();
gpu_bytes+=quat.device.row_bytes();
}
} }
if (vel && !_vel) { if (vel && !_vel) {
@ -451,7 +449,7 @@ template <class numtyp, class acctyp>
void AtomT::compile_kernels(UCL_Device &dev) { void AtomT::compile_kernels(UCL_Device &dev) {
std::string flags = ""; std::string flags = "";
atom_program=new UCL_Program(dev); atom_program=new UCL_Program(dev);
atom_program->load_string(atom,flags,nullptr,screen); atom_program->load_string(atom,flags.c_str(),nullptr,stderr);
k_cast_x.set_function(*atom_program,"kernel_cast_x"); k_cast_x.set_function(*atom_program,"kernel_cast_x");
_compiled=true; _compiled=true;
} }

View File

@ -18,7 +18,7 @@
#endif #endif
__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, __kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
const __global numtyp *restrict x, const __global double *restrict x,
const __global int *restrict type, const __global int *restrict type,
const int nall) { const int nall) {
int ii=GLOBAL_ID_X; int ii=GLOBAL_ID_X;

View File

@ -52,6 +52,12 @@ using namespace ucl_cudadr;
namespace LAMMPS_AL { namespace LAMMPS_AL {
// Plain record holding per-ellipsoid data that is handed to the GPU library
// through a caller-supplied pointer (see cast_quat_data, which indexes an
// array of these records).
// NOTE(review): presumably this must match the layout of the host code's
// ellipsoid bonus record - confirm before reordering or retyping any member.
struct EllipsoidBonus {
// three shape values; not read by the code visible in this file section
double shape[3];
// four quaternion components; cast_quat_data copies quat[0..3] into the
// atom quat write buffer
double quat[4];
// presumably the local atom index owning this record - confirm against caller
int ilocal;
};
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
class Atom { class Atom {
public: public:
@ -306,8 +312,8 @@ class Atom {
if (_x_avail==false) { if (_x_avail==false) {
double t=MPI_Wtime(); double t=MPI_Wtime();
#ifdef GPU_CAST #ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
#else #else
vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0])); vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0])); vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
@ -351,6 +357,24 @@ class Atom {
add_x_data(host_ptr,host_type); add_x_data(host_ptr,host_type);
} }
// Stage mu values in the host side of the quat write buffer (mu data is
// stored in quat). Time spent here is added to _time_cast.
template<class cpytyp>
inline void cast_mu_data(cpytyp *host_ptr) {
  // Nothing to stage while the quat data is already flagged as available
  if (_quat_avail) return;

  const double t_begin=MPI_Wtime();
  if (sizeof(numtyp)!=sizeof(double)) {
    // numtyp is not double-sized: convert the _nall*4 values one by one
    #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
    #pragma omp parallel for simd schedule(static)
    #elif (LAL_USE_OMP_SIMD == 1)
    #pragma omp simd
    #endif
    for (int k=0; k<_nall*4; k++) quat[k]=host_ptr[k];
  } else {
    // numtyp is double-sized: copy the _nall*4 values in one block
    memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
  }
  _time_cast+=MPI_Wtime()-t_begin;
}
// Cast charges to write buffer // Cast charges to write buffer
template<class cpytyp> template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) { inline void cast_q_data(cpytyp *host_ptr) {
@ -384,22 +408,24 @@ class Atom {
} }
// Cast quaternions to write buffer // Cast quaternions to write buffer
template<class cpytyp> inline void cast_quat_data(const int *ellipsoid,
inline void cast_quat_data(cpytyp *host_ptr) { const EllipsoidBonus *bonus) {
if (_quat_avail==false) { if (_quat_avail==false) {
double t=MPI_Wtime(); double t=MPI_Wtime();
if (_host_view) { #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
quat.host.view((numtyp*)host_ptr,_nall*4,*dev); #pragma omp parallel for simd schedule(static)
quat.device.view(quat.host); #elif (LAL_USE_OMP_SIMD == 1)
} else if (sizeof(numtyp)==sizeof(double)) #pragma omp simd
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp)); #endif
else for (int i=0; i<_nall; i++) {
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) int qi = ellipsoid[i];
#pragma omp parallel for simd schedule(static) if (qi > -1) {
#elif (LAL_USE_OMP_SIMD == 1) quat[i*4] = bonus[qi].quat[0];
#pragma omp simd quat[i*4+1] = bonus[qi].quat[1];
#endif quat[i*4+2] = bonus[qi].quat[2];
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i]; quat[i*4+3] = bonus[qi].quat[3];
}
}
_time_cast+=MPI_Wtime()-t; _time_cast+=MPI_Wtime()-t;
} }
} }
@ -419,10 +445,6 @@ class Atom {
inline void cast_v_data(double **host_ptr, const tagint *host_tag) { inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
if (_v_avail==false) { if (_v_avail==false) {
double t=MPI_Wtime(); double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
#else
vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0])); vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0])); vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
#if (LAL_USE_OMP == 1) #if (LAL_USE_OMP == 1)
@ -434,7 +456,6 @@ class Atom {
vp[i].z=host_p[i].z; vp[i].z=host_p[i].z;
vp[i].w=host_tag[i]; vp[i].w=host_tag[i];
} }
#endif
_time_cast+=MPI_Wtime()-t; _time_cast+=MPI_Wtime()-t;
} }
} }
@ -444,16 +465,7 @@ class Atom {
inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) { inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
time_vel.start(); time_vel.start();
if (_v_avail==false) { if (_v_avail==false) {
#ifdef GPU_CAST
v_cast.update_device(_nall*3,true);
tag_cast.update_device(_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
#else
v.update_device(_nall*4,true); v.update_device(_nall*4,true);
#endif
_v_avail=true; _v_avail=true;
} }
time_vel.stop(); time_vel.stop();
@ -519,7 +531,7 @@ class Atom {
UCL_Vector<numtyp4,numtyp4> extra; UCL_Vector<numtyp4,numtyp4> extra;
#ifdef GPU_CAST #ifdef GPU_CAST
UCL_Vector<numtyp,numtyp> x_cast; UCL_Vector<double,double> x_cast;
UCL_Vector<int,int> type_cast; UCL_Vector<int,int> type_cast;
#endif #endif

View File

@ -143,10 +143,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10); _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _tep.alloc(_max_tep_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_fieldp_size = _max_tep_size; _max_fieldp_size = _max_tep_size;
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _fieldp.alloc(_max_fieldp_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_thetai_size = 0; _max_thetai_size = 0;
@ -387,7 +387,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
if (inum_full>_max_tep_size) { if (inum_full>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10); _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
_tep.resize(_max_tep_size*4); _tep.resize(_max_tep_size*3);
} }
*tep_ptr=_tep.host.begin(); *tep_ptr=_tep.host.begin();
@ -403,7 +403,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
// copy tep from device to host // copy tep from device to host
_tep.update_host(_max_tep_size*4,false); _tep.update_host(_max_tep_size*3,false);
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@ -429,7 +429,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double
// copy field and fieldp from device to host (_fieldp store both arrays, one after another) // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
_fieldp.update_host(_max_fieldp_size*8,false); _fieldp.update_host(_max_fieldp_size*6,false);
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@ -456,7 +456,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
// NOTE: move this step to update_fieldp() to delay device-host transfer // NOTE: move this step to update_fieldp() to delay device-host transfer
// after umutual1 and self are done on the GPU // after umutual1 and self are done on the GPU
// *fieldp_ptr=_fieldp.host.begin(); // *fieldp_ptr=_fieldp.host.begin();
// _fieldp.update_host(_max_fieldp_size*8,false); // _fieldp.update_host(_max_fieldp_size*6,false);
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@ -732,7 +732,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
device->add_ans_object(ans); device->add_ans_object(ans);
// copy tep from device to host // copy tep from device to host
_tep.update_host(_max_tep_size*4,false); _tep.update_host(_max_tep_size*3,false);
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------

View File

@ -233,7 +233,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
atom->cast_x_data(host_x,host_type); atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q); atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]); atom->cast_mu_data(host_mu[0]);
hd_balancer.start_timer(); hd_balancer.start_timer();
atom->add_x_data(host_x,host_type); atom->add_x_data(host_x,host_type);
atom->add_q_data(); atom->add_q_data();
@ -297,12 +297,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
if (!success) if (!success)
return nullptr; return nullptr;
atom->cast_q_data(host_q); atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]); atom->cast_mu_data(host_mu[0]);
hd_balancer.start_timer(); hd_balancer.start_timer();
} else { } else {
atom->cast_x_data(host_x,host_type); atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q); atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]); atom->cast_mu_data(host_mu[0]);
hd_balancer.start_timer(); hd_balancer.start_timer();
atom->add_x_data(host_x,host_type); atom->add_x_data(host_x,host_type);
} }

View File

@ -375,7 +375,8 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
const bool eflag_in, const bool vflag_in, const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, int &host_start, const double cpu_time,
bool &success, double **host_quat) { bool &success, const int *ellipsoid,
const EllipsoidBonus *bonus) {
acc_timers(); acc_timers();
int eflag, vflag; int eflag, vflag;
if (eflag_in) eflag=2; if (eflag_in) eflag=2;
@ -409,7 +410,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
list=ilist; list=ilist;
atom->cast_x_data(host_x,host_type); atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]); atom->cast_quat_data(ellipsoid,bonus);
hd_balancer.start_timer(); hd_balancer.start_timer();
atom->add_x_data(host_x,host_type); atom->add_x_data(host_x,host_type);
atom->add_quat_data(); atom->add_quat_data();
@ -433,7 +434,8 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum, int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success, const double cpu_time, bool &success,
double **host_quat) { const int *ellipsoid,
const EllipsoidBonus *bonus) {
acc_timers(); acc_timers();
int eflag, vflag; int eflag, vflag;
if (eflag_in) eflag=2; if (eflag_in) eflag=2;
@ -460,11 +462,11 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
sublo, subhi, tag, nspecial, special, success); sublo, subhi, tag, nspecial, special, success);
if (!success) if (!success)
return nullptr; return nullptr;
atom->cast_quat_data(host_quat[0]); atom->cast_quat_data(ellipsoid,bonus);
hd_balancer.start_timer(); hd_balancer.start_timer();
} else { } else {
atom->cast_x_data(host_x,host_type); atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]); atom->cast_quat_data(ellipsoid,bonus);
hd_balancer.start_timer(); hd_balancer.start_timer();
atom->add_x_data(host_x,host_type); atom->add_x_data(host_x,host_type);
} }

View File

@ -170,7 +170,8 @@ class BaseEllipsoid {
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **quat); const double cpu_time, bool &success,
const int *ellipsoid, const EllipsoidBonus *bonus);
/// Pair loop with device neighboring /// Pair loop with device neighboring
int**compute(const int ago, const int inum_full, const int nall, int**compute(const int ago, const int inum_full, const int nall,
@ -179,7 +180,7 @@ class BaseEllipsoid {
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success, int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat); const int *ellipsoid, const EllipsoidBonus *bonus);
// -------------------------- DEVICE DATA ------------------------- // -------------------------- DEVICE DATA -------------------------

View File

@ -31,7 +31,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -130,7 +131,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -150,7 +151,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
beck2[tid]=beck2_in[tid]; beck2[tid]=beck2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -172,6 +173,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -32,7 +32,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -48,7 +48,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -123,7 +124,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -144,7 +145,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -166,6 +167,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -36,7 +36,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -158,7 +159,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -183,7 +184,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -206,6 +207,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -51,7 +51,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -75,7 +75,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -95,6 +95,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -192,7 +193,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -217,7 +218,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -240,6 +241,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -38,7 +38,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -63,7 +63,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -89,6 +89,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -174,7 +175,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -200,7 +201,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -229,6 +230,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -39,7 +39,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -64,7 +64,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -90,6 +90,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -176,7 +177,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -202,7 +203,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -231,6 +232,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -31,7 +31,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -120,7 +121,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -141,7 +142,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -163,6 +164,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -36,7 +36,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -151,7 +152,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -177,7 +178,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -200,6 +201,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -36,7 +36,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -159,7 +160,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -185,7 +186,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -208,6 +209,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -34,7 +34,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -53,7 +53,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_bio(); local_allocate_store_bio();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -73,6 +73,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -159,7 +160,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -187,7 +188,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES) if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR]; ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -209,6 +210,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -35,7 +35,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -50,7 +50,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_bio(); local_allocate_store_bio();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -70,6 +70,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -156,7 +157,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES) if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR]; ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -203,6 +204,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -34,7 +34,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
const __global int *form, const __global int *form,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -50,7 +50,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -69,6 +69,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -188,7 +189,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
const __global int *form_in, const __global int *form_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -215,7 +216,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -237,6 +238,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -35,7 +35,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -54,7 +54,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
sp_cl[2]=sp_cl_in[2]; sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3]; sp_cl[3]=sp_cl_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -74,6 +74,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
numtyp factor_coul; numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)]; factor_coul = sp_cl[sbmask(j)];
@ -125,7 +126,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -146,7 +147,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
cutsq[tid]=_cutsq[tid]; cutsq[tid]=_cutsq[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -169,6 +170,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
numtyp factor_coul; numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)]; factor_coul = sp_cl[sbmask(j)];

View File

@ -35,7 +35,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -55,7 +55,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
sp_cl[2]=sp_cl_in[2]; sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3]; sp_cl[3]=sp_cl_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -75,6 +75,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
numtyp factor_coul; numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)]; factor_coul = sp_cl[sbmask(j)];
@ -129,7 +130,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -153,7 +154,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
cutsq[tid]=_cutsq[tid]; cutsq[tid]=_cutsq[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -176,6 +177,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
numtyp factor_coul; numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)]; factor_coul = sp_cl[sbmask(j)];

View File

@ -36,7 +36,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -56,7 +56,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -81,6 +81,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_coul, r, prefactor, erfcc; numtyp factor_coul, r, prefactor, erfcc;
@ -138,7 +139,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -156,7 +157,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
if (tid<4) if (tid<4)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -183,6 +184,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_coul, r, prefactor, erfcc; numtyp factor_coul, r, prefactor, erfcc;

View File

@ -35,7 +35,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -54,7 +54,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
sp_cl[2]=sp_cl_in[2]; sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3]; sp_cl[3]=sp_cl_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp e_coul, virial[6]; acctyp e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -73,6 +73,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_coul; numtyp factor_coul;
@ -132,7 +133,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -152,7 +153,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
scale[tid]=scale_in[tid]; scale[tid]=scale_in[tid];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp e_coul, virial[6]; acctyp e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -174,6 +175,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_coul; numtyp factor_coul;

View File

@ -49,7 +49,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -68,7 +68,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
sp_cl[2]=sp_cl_in[2]; sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3]; sp_cl[3]=sp_cl_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp e_coul, virial[6]; acctyp e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -87,6 +87,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_coul; numtyp factor_coul;
@ -166,7 +167,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -186,7 +187,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
scale[tid]=scale_in[tid]; scale[tid]=scale_in[tid];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp e_coul, virial[6]; acctyp e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -208,6 +209,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_coul; numtyp factor_coul;

View File

@ -370,7 +370,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
_ocl_config_name="CUSTOM"; _ocl_config_name="CUSTOM";
int token_count=0; int token_count=0;
std::string params[18]; std::string params[19];
char ocl_config[2048]; char ocl_config[2048];
strncpy(ocl_config,s_config.c_str(),2047); strncpy(ocl_config,s_config.c_str(),2047);
char *pch = strtok(ocl_config,","); char *pch = strtok(ocl_config,",");
@ -378,7 +378,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
pch = strtok(nullptr,","); pch = strtok(nullptr,",");
if (pch == nullptr) return -11; if (pch == nullptr) return -11;
while (pch != nullptr) { while (pch != nullptr) {
if (token_count==18) if (token_count==19)
return -11; return -11;
params[token_count]=pch; params[token_count]=pch;
token_count++; token_count++;
@ -389,6 +389,16 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
#ifdef CL_VERSION_2_0 #ifdef CL_VERSION_2_0
_ocl_compile_string+="-cl-std=CL2.0 "; _ocl_compile_string+="-cl-std=CL2.0 ";
#endif #endif
if (params[0]=="500") {
_ocl_compile_string+="-DINTEL_OCL ";
#ifdef _DOUBLE_DOUBLE
// workaround for double precision with Intel OpenCL
params[4]="0";
#endif
}
#ifdef LAL_DISABLE_PREFETCH
params[18]="0";
#endif
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math "; if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+ _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
std::string(OCL_PRECISION_COMPILE); std::string(OCL_PRECISION_COMPILE);
@ -421,7 +431,8 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
" -DMAX_SHARED_TYPES="+params[15]+ " -DMAX_SHARED_TYPES="+params[15]+
" -DMAX_BIO_SHARED_TYPES="+params[16]+ " -DMAX_BIO_SHARED_TYPES="+params[16]+
" -DPPPM_MAX_SPLINE="+params[17]; " -DPPPM_MAX_SPLINE="+params[17]+
" -DNBOR_PREFETCH="+params[18];
_ocl_compile_string += extra_args; _ocl_compile_string += extra_args;
#endif #endif
return 0; return 0;
@ -558,7 +569,11 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
return -3; return -3;
if (_user_cell_size<0.0) { if (_user_cell_size<0.0) {
#ifndef LAL_USE_OLD_NEIGHBOR
_neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
#else
_neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size()); _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
#endif
} else } else
_neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size()); _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
nbor->set_cutoff(cutoff); nbor->set_cutoff(cutoff);
@ -954,7 +969,7 @@ int DeviceT::compile_kernels() {
k_info.set_function(*dev_program,"kernel_info"); k_info.set_function(*dev_program,"kernel_info");
_compiled=true; _compiled=true;
UCL_Vector<int,int> gpu_lib_data(19,*gpu,UCL_NOT_PINNED); UCL_Vector<int,int> gpu_lib_data(20,*gpu,UCL_NOT_PINNED);
k_info.set_size(1,1); k_info.set_size(1,1);
k_info.run(&gpu_lib_data); k_info.run(&gpu_lib_data);
gpu_lib_data.update_host(false); gpu_lib_data.update_host(false);

View File

@ -52,4 +52,5 @@ __kernel void kernel_info(__global int *info) {
info[16]=MAX_SHARED_TYPES; info[16]=MAX_SHARED_TYPES;
info[17]=MAX_BIO_SHARED_TYPES; info[17]=MAX_BIO_SHARED_TYPES;
info[18]=PPPM_MAX_SPLINE; info[18]=PPPM_MAX_SPLINE;
info[19]=NBOR_PREFETCH;
} }

View File

@ -211,7 +211,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -235,7 +235,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
@ -257,6 +257,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -282,8 +283,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
numtyp rinv, r3inv, r5inv, r7inv; numtyp rinv, r3inv, r5inv, r7inv;
numtyp pre1, pre2, pre3, pre4; numtyp pre1, pre2, pre3, pre4;
numtyp pdotp, pidotr, pjdotr; numtyp pdotp, pidotr, pjdotr;
acctyp4 forcecoul, ticoul; acctyp3 forcecoul, ticoul;
acctyp4 force; acctyp3 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0; ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -418,7 +419,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -445,7 +446,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
@ -470,6 +471,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -494,8 +496,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
numtyp rinv, r3inv, r5inv, r7inv; numtyp rinv, r3inv, r5inv, r7inv;
numtyp pre1, pre2, pre3, pre4; numtyp pre1, pre2, pre3, pre4;
numtyp pdotp, pidotr, pjdotr; numtyp pdotp, pidotr, pjdotr;
acctyp4 forcecoul, ticoul; acctyp3 forcecoul, ticoul;
acctyp4 force; acctyp3 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0; ticoul.x = ticoul.y = ticoul.z = (acctyp)0;

View File

@ -212,7 +212,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -236,7 +236,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
@ -258,6 +258,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -286,8 +287,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv; numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
numtyp4 aforcecoul, bforcecoul; numtyp4 aforcecoul, bforcecoul;
acctyp4 forcecoul, ticoul; acctyp3 forcecoul, ticoul;
acctyp4 force; acctyp3 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0; ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -450,7 +451,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -478,7 +479,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
@ -503,6 +504,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -530,8 +532,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv; numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
numtyp4 aforcecoul, bforcecoul; numtyp4 aforcecoul, bforcecoul;
acctyp4 forcecoul, ticoul; acctyp3 forcecoul, ticoul;
acctyp4 force; acctyp3 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0; ticoul.x = ticoul.y = ticoul.z = (acctyp)0;

View File

@ -213,7 +213,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -238,7 +238,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
@ -264,6 +264,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -291,8 +292,8 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz; numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1; numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
numtyp fdx,fdy,fdz,fax,fay,faz; numtyp fdx,fdy,fdz,fax,fay,faz;
acctyp4 forcecoul, ticoul; acctyp3 forcecoul, ticoul;
acctyp4 force; acctyp3 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0; ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -462,7 +463,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -490,7 +491,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
@ -519,6 +520,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -545,8 +547,8 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz; numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1; numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
numtyp fdx,fdy,fdz,fax,fay,faz; numtyp fdx,fdy,fdz,fax,fay,faz;
acctyp4 forcecoul, ticoul; acctyp3 forcecoul, ticoul;
acctyp4 force; acctyp3 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0; ticoul.x = ticoul.y = ticoul.z = (acctyp)0;

View File

@ -168,7 +168,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_sqrt, const __global numtyp *restrict sp_sqrt,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -183,7 +183,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -203,6 +203,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
numtyp factor_dpd, factor_sqrt; numtyp factor_dpd, factor_sqrt;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_dpd = sp_lj[sbmask(j)]; factor_dpd = sp_lj[sbmask(j)];
@ -284,7 +285,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_sqrt_in, const __global numtyp *restrict sp_sqrt_in,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -318,7 +319,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -343,6 +344,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
numtyp factor_dpd, factor_sqrt; numtyp factor_dpd, factor_sqrt;
#endif #endif
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
#ifndef ONETYPE #ifndef ONETYPE

View File

@ -246,6 +246,7 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
tfrho=type2frho[itype]; tfrho=type2frho[itype];
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -332,6 +333,7 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
#endif #endif
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -376,7 +378,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
const __global numtyp *cutsq, const __global numtyp *cutsq,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *ans, __global acctyp3 *ans,
__global acctyp *engv, __global acctyp *engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int ntypes, const int nbor_pitch, const int ntypes,
@ -388,7 +390,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_answers_eam(); local_allocate_store_answers_eam();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -407,6 +409,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -487,7 +490,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
const __global numtyp *cutsq, const __global numtyp *cutsq,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *ans, __global acctyp3 *ans,
__global acctyp *engv, __global acctyp *engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const numtyp cutforcesq, const int nbor_pitch, const numtyp cutforcesq,
@ -510,7 +513,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
int n_stride; int n_stride;
local_allocate_store_answers_eam(); local_allocate_store_answers_eam();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -532,6 +535,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
#endif #endif
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;

View File

@ -152,7 +152,7 @@ _texture_2d( quat_tex,int4);
engv+=inum; \ engv+=inum; \
} \ } \
} \ } \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -224,7 +224,7 @@ _texture_2d( quat_tex,int4);
engv+=inum; \ engv+=inum; \
} \ } \
} \ } \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \

View File

@ -30,7 +30,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -40,7 +40,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -59,6 +59,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -109,7 +110,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -127,7 +128,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
gauss1[tid]=gauss1_in[tid]; gauss1[tid]=gauss1_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -149,6 +150,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -80,6 +80,9 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
} }
#ifdef INTEL_OCL
__attribute__((intel_reqd_sub_group_size(16)))
#endif
__kernel void k_gayberne(const __global numtyp4 *restrict x_, __kernel void k_gayberne(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict q, const __global numtyp4 *restrict q,
const __global numtyp4 *restrict shape, const __global numtyp4 *restrict shape,
@ -90,7 +93,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
const __global numtyp *restrict lshape, const __global numtyp *restrict lshape,
const __global int *dev_nbor, const __global int *dev_nbor,
const int stride, const int stride,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
const int astride, const int astride,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
@ -108,7 +111,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
sp_lj[2]=gum[5]; sp_lj[2]=gum[5];
sp_lj[3]=gum[6]; sp_lj[3]=gum[6];
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
@ -138,6 +141,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_nbor+nbor+n_stride);
int j=dev_nbor[nbor]; int j=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;

View File

@ -108,28 +108,33 @@ int** compute(const int ago, const int inum_full, const int nall,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success, int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat); const int *ellipsoid, const EllipsoidBonus *bonus);
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, tagint **special, double *subhi, tagint *tag, int **nspecial,
const bool eflag, const bool vflag, const bool eatom, tagint **special, const bool eflag, const bool vflag,
const bool vatom, int &host_start, int **ilist, const bool eatom, const bool vatom, int &host_start,
int **jnum, const double cpu_time, bool &success, int **ilist, int **jnum, const double cpu_time,
double **host_quat) { bool &success, const int *ellipsoid,
const void *bonus) {
return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
tag, nspecial, special, eflag, vflag, eatom, vatom, tag, nspecial, special, eflag, vflag, eatom, vatom,
host_start, ilist, jnum, cpu_time, success, host_quat); host_start, ilist, jnum, cpu_time, success,
ellipsoid,
static_cast<const EllipsoidBonus *>(bonus));
} }
int * gb_gpu_compute(const int ago, const int inum_full, const int nall, int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) { const double cpu_time, bool &success,
const int *ellipsoid, const void *bonus) {
return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist, return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
numj, firstneigh, eflag, vflag, eatom, vatom, host_start, numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
cpu_time, success, host_quat); cpu_time, success, ellipsoid,
static_cast<const EllipsoidBonus *>(bonus));
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------

View File

@ -34,7 +34,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
const __global numtyp *restrict lshape, const __global numtyp *restrict lshape,
const __global int *dev_nbor, const __global int *dev_nbor,
const int stride, const int stride,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
sp_lj[2]=gum[5]; sp_lj[2]=gum[5];
sp_lj[3]=gum[6]; sp_lj[3]=gum[6];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -75,6 +75,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_nbor+nbor+n_stride);
int j=dev_nbor[nbor]; int j=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -259,7 +260,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
const __global numtyp *restrict gum, const __global numtyp *restrict gum,
const int stride, const int stride,
const __global int *dev_ij, const __global int *dev_ij,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
const int eflag, const int vflag, const int start, const int eflag, const int vflag, const int start,
@ -277,7 +278,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
sp_lj[2]=gum[5]; sp_lj[2]=gum[5];
sp_lj[3]=gum[6]; sp_lj[3]=gum[6];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -296,6 +297,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_ij+nbor+n_stride);
int j=dev_ij[nbor]; int j=dev_ij[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -347,7 +349,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict gum, const __global numtyp *restrict gum,
const int stride, const int stride,
const __global int *dev_ij, const __global int *dev_ij,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -371,7 +373,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -393,6 +395,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_ij+nbor+n_stride);
int j=dev_ij[nbor]; int j=dev_ij[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -113,7 +113,7 @@ _texture( q_tex,int2);
dufld[5]=red_acc[5][tid]; \ dufld[5]=red_acc[5][tid]; \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 t; \ acctyp3 t; \
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \ t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \ (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \ t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
@ -147,7 +147,7 @@ _texture( q_tex,int2);
_fieldp[5]=red_acc[5][tid]; \ _fieldp[5]=red_acc[5][tid]; \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 f, fp; \ acctyp3 f, fp; \
f.x = _fieldp[0]; \ f.x = _fieldp[0]; \
f.y = _fieldp[1]; \ f.y = _fieldp[1]; \
f.z = _fieldp[2]; \ f.z = _fieldp[2]; \
@ -174,7 +174,7 @@ _texture( q_tex,int2);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -254,7 +254,7 @@ _texture( q_tex,int2);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 t; \ acctyp3 t; \
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \ t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \ (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \ t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
@ -277,7 +277,7 @@ _texture( q_tex,int2);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 f, fp; \ acctyp3 f, fp; \
f.x = _fieldp[0]; \ f.x = _fieldp[0]; \
f.y = _fieldp[1]; \ f.y = _fieldp[1]; \
f.z = _fieldp[2]; \ f.z = _fieldp[2]; \
@ -302,7 +302,7 @@ _texture( q_tex,int2);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -391,7 +391,7 @@ _texture( q_tex,int2);
if (t_per_atom>1) \ if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -416,9 +416,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global acctyp4 *restrict tep, __global acctyp3 *restrict tep,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int nall, const int nbor_pitch,
const int t_per_atom, const numtyp aewald, const int t_per_atom, const numtyp aewald,
@ -432,7 +432,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_charge(); local_allocate_store_charge();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -441,7 +441,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
for (int l=0; l<6; l++) virial[l]=(acctyp)0; for (int l=0; l<6; l++) virial[l]=(acctyp)0;
} }
acctyp4 tq; acctyp3 tq;
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
const __global numtyp4* polar1 = &extra[0]; const __global numtyp4* polar1 = &extra[0];
@ -634,7 +634,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
frcx = sizik * frcx; frcx = sizik * frcx;
frcy = sizik * frcy; frcy = sizik * frcy;
frcz = sizik * frcz; frcz = sizik * frcz;
// compute the torque components for this interaction // compute the torque components for this interaction
numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) -
@ -717,7 +717,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int nall, const int nbor_pitch,
@ -730,7 +730,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_charge(); local_allocate_store_charge();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -895,9 +895,9 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global acctyp4 *restrict tep, __global acctyp3 *restrict tep,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int nall, const int nbor_pitch,
const int t_per_atom, const numtyp aewald, const int t_per_atom, const numtyp aewald,
@ -910,7 +910,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_charge(); local_allocate_store_charge();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -919,7 +919,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
for (int l=0; l<6; l++) virial[l]=(acctyp)0; for (int l=0; l<6; l++) virial[l]=(acctyp)0;
} }
acctyp4 tq; acctyp3 tq;
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
const __global numtyp4* polar1 = &extra[0]; const __global numtyp4* polar1 = &extra[0];
@ -1210,7 +1210,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict fieldp, __global acctyp3 *restrict fieldp,
const int inum, const int nall, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp off2, const numtyp aewald, const numtyp off2,
@ -1390,7 +1390,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict fieldp, __global acctyp3 *restrict fieldp,
const int inum, const int nall, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp off2, const numtyp aewald, const numtyp off2,
@ -1541,9 +1541,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor, const __global int *dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global acctyp4 *restrict tep, __global acctyp3 *restrict tep,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int t_per_atom, const int nall, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp felec, const numtyp aewald, const numtyp felec,
@ -1556,7 +1556,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_charge(); local_allocate_store_charge();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -1697,7 +1697,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
// calculate the real space Ewald error function terms // calculate the real space Ewald error function terms
int m; int m;
numtyp ralpha = aewald * r; numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp exp2a = ucl_exp(-ralpha*ralpha);
@ -2003,12 +2003,12 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
if (ii<inum) { if (ii<inum) {
const int nlpts = (bsorder-1) / 2; const int nlpts = (bsorder-1) / 2;
int istart = fast_mul(ii,4); int istart = fast_mul(ii,4);
const int igridx = igrid[istart]; const int igridx = igrid[istart];
const int igridy = igrid[istart+1]; const int igridy = igrid[istart+1];
const int igridz = igrid[istart+2]; const int igridz = igrid[istart+2];
// now istart is used to index thetai1, thetai2 and thetai3 // now istart is used to index thetai1, thetai2 and thetai3
istart = fast_mul(ii,bsorder); istart = fast_mul(ii,bsorder);
@ -2202,7 +2202,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
fdip_buf[7] = tuv110_1; fdip_buf[7] = tuv110_1;
fdip_buf[8] = tuv101_1; fdip_buf[8] = tuv101_1;
fdip_buf[9] = tuv011_1; fdip_buf[9] = tuv011_1;
idx = ii; idx = ii;
for (int m = 0; m < 10; m++) { for (int m = 0; m < 10; m++) {
fdip_phi1[idx] = fdip_buf[m]; fdip_phi1[idx] = fdip_buf[m];
idx += inum; idx += inum;
@ -2218,7 +2218,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
fdip_buf[7] = tuv110_2; fdip_buf[7] = tuv110_2;
fdip_buf[8] = tuv101_2; fdip_buf[8] = tuv101_2;
fdip_buf[9] = tuv011_2; fdip_buf[9] = tuv011_2;
idx = ii; idx = ii;
for (int m = 0; m < 10; m++) { for (int m = 0; m < 10; m++) {
fdip_phi2[idx] = fdip_buf[m]; fdip_phi2[idx] = fdip_buf[m];
idx += inum; idx += inum;
@ -2244,7 +2244,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
fdip_buf[17] = tuv102; fdip_buf[17] = tuv102;
fdip_buf[18] = tuv012; fdip_buf[18] = tuv012;
fdip_buf[19] = tuv111; fdip_buf[19] = tuv111;
idx = ii; idx = ii;
for (int m = 0; m < 20; m++) { for (int m = 0; m < 20; m++) {
fdip_sum_phi[idx] = fdip_buf[m]; fdip_sum_phi[idx] = fdip_buf[m];
idx += inum; idx += inum;
@ -2275,12 +2275,12 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1,
if (ii<inum) { if (ii<inum) {
int nlpts = (bsorder-1) / 2; int nlpts = (bsorder-1) / 2;
int istart = fast_mul(ii,4); int istart = fast_mul(ii,4);
int igridx = igrid[istart]; int igridx = igrid[istart];
int igridy = igrid[istart+1]; int igridy = igrid[istart+1];
int igridz = igrid[istart+2]; int igridz = igrid[istart+2];
// now istart is used to index thetai1, thetai2 and thetai3 // now istart is used to index thetai1, thetai2 and thetai3
istart = fast_mul(ii,bsorder); istart = fast_mul(ii,bsorder);
@ -2410,7 +2410,7 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1,
buf[18] = tuv012; buf[18] = tuv012;
buf[19] = tuv111; buf[19] = tuv111;
int idx = ii; int idx = ii;
for (int m = 0; m < 20; m++) { for (int m = 0; m < 20; m++) {
fphi[idx] = buf[m]; fphi[idx] = buf[m];
idx += inum; idx += inum;

View File

@ -31,7 +31,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -41,7 +41,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -59,6 +59,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -110,7 +111,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -144,7 +145,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -166,6 +167,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
NOUNROLL NOUNROLL
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
#ifndef ONETYPE #ifndef ONETYPE
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -31,7 +31,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -118,7 +119,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -139,7 +140,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -161,6 +162,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -36,7 +36,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -156,7 +157,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -182,7 +183,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -205,6 +206,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -36,7 +36,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -147,7 +148,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -173,7 +174,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -196,6 +197,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -36,7 +36,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -154,7 +155,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -204,6 +205,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -36,7 +36,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -154,7 +155,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -178,7 +179,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -201,6 +202,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -94,7 +94,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -117,7 +117,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -139,6 +139,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
numtyp cut_coul = ucl_sqrt(cut_coulsq); numtyp cut_coul = ucl_sqrt(cut_coulsq);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -215,7 +216,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -239,7 +240,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -264,6 +265,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
numtyp cut_coul = ucl_sqrt(cut_coulsq); numtyp cut_coul = ucl_sqrt(cut_coulsq);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -39,7 +39,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -132,7 +133,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -155,7 +156,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -176,6 +177,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -38,7 +38,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -62,7 +62,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -88,6 +88,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul, r, prefactor, erfcc; numtyp factor_lj, factor_coul, r, prefactor, erfcc;
@ -165,7 +166,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -190,7 +191,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -219,6 +220,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul, r, prefactor, erfcc; numtyp factor_lj, factor_coul, r, prefactor, erfcc;

View File

@ -33,7 +33,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -122,7 +123,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -143,7 +144,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -165,6 +166,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -36,7 +36,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -158,7 +159,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -204,6 +205,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -34,7 +34,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -50,7 +50,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -134,7 +135,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -156,7 +157,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
ljsw[tid]=ljsw_in[tid]; ljsw[tid]=ljsw_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -177,6 +178,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -33,7 +33,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -43,7 +43,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -61,8 +61,9 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
numtyp force, r6inv, factor_lj, forcelj; numtyp force, r6inv, factor_lj, forcelj;
numtyp r, t, tsq, fskin; numtyp r, t, tsq, fskin;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -76,10 +77,10 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype; int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) { if (rsq<lj1[mtype].z) {
numtyp r2inv=ucl_recip(rsq); numtyp r2inv=ucl_recip(rsq);
if (rsq < lj1[mtype].w) { if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
@ -135,7 +136,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -169,7 +170,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -194,6 +195,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
NOUNROLL NOUNROLL
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
#ifndef ONETYPE #ifndef ONETYPE
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -236,7 +238,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
if (rsq < lj1[mtype].w) if (rsq < lj1[mtype].w)
e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z; e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
else else
e = ljsw0[mtype].x - ljsw[mtype].x*t - e = ljsw0[mtype].x - ljsw[mtype].x*t -
ljsw[mtype].y*tsq/2.0 - ljsw[mtype].z*tsq*t/3.0 - ljsw[mtype].y*tsq/2.0 - ljsw[mtype].z*tsq*t/3.0 -
ljsw[mtype].w*tsq*tsq/4.0 - lj3[mtype].z; //??? ljsw[mtype].w*tsq*tsq/4.0 - lj3[mtype].z; //???

View File

@ -31,7 +31,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -128,7 +129,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -149,7 +150,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -171,6 +172,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -36,7 +36,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
@ -166,7 +167,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -189,7 +190,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6]; acctyp energy, e_coul, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -212,6 +213,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;

View File

@ -59,7 +59,7 @@ _texture( q_tex,int2);
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
GPU analogue of Atom::map inline method, GPU analogue of Atom::map inline method,
but now limited to map_array mapping style. but now limited to map_array mapping style.
Map global ID to local index of atom. Map global ID to local index of atom.
---------------------------------------------------------------------- */ ---------------------------------------------------------------------- */
ucl_inline int atom_mapping(const __global int *map, tagint glob) { ucl_inline int atom_mapping(const __global int *map, tagint glob) {
@ -134,16 +134,16 @@ ucl_inline void compute_newsite(int iO, int iH1, int iH2,
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Compute resulting forces (ans), energies and virial (engv). Compute resulting forces (ans), energies and virial (engv).
An additional term is calculated based on the previously An additional term is calculated based on the previously
calculated values on the virtual sites (ansO), calculated values on the virtual sites (ansO),
which should be distributed over the real atoms. which should be distributed over the real atoms.
For some hydrogens, the corresponding oxygens are For some hydrogens, the corresponding oxygens are
not local atoms and the ansO value is not calculated. not local atoms and the ansO value is not calculated.
The required increase is calculated directly in the main function. The required increase is calculated directly in the main function.
---------------------------------------------------------------------- */ ---------------------------------------------------------------------- */
__kernel void k_lj_tip4p_long_distrib( __kernel void k_lj_tip4p_long_distrib(
const __global numtyp4 *restrict x_, const __global numtyp4 *restrict x_,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -151,11 +151,11 @@ __kernel void k_lj_tip4p_long_distrib(
const __global numtyp4 *restrict m, const __global numtyp4 *restrict m,
const int typeO, const int typeH, const int typeO, const int typeH,
const numtyp alpha, const numtyp alpha,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global acctyp4 *restrict ansO) { const __global acctyp4 *restrict ansO) {
int i = BLOCK_ID_X*(BLOCK_SIZE_X)+THREAD_ID_X; int i = BLOCK_ID_X*(BLOCK_SIZE_X)+THREAD_ID_X;
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
if (i<inum) { if (i<inum) {
@ -208,7 +208,7 @@ __kernel void k_lj_tip4p_long_distrib(
engv[inum*engv_iter + i] += vM.z * (acctyp)(1 - alpha); engv[inum*engv_iter + i] += vM.z * (acctyp)(1 - alpha);
} }
} }
acctyp4 old=ans[i]; acctyp3 old=ans[i];
old.x+=f.x; old.x+=f.x;
old.y+=f.y; old.y+=f.y;
old.z+=f.z; old.z+=f.z;
@ -219,7 +219,7 @@ __kernel void k_lj_tip4p_long_distrib(
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Rebuild hneigh after the neighbor build. Rebuild hneigh after the neighbor build.
hneigh stores local IDs of H1 and H2 for each local and ghost O hneigh stores local IDs of H1 and H2 for each local and ghost O
and local ID of O for each local H. and local ID of O for each local H.
---------------------------------------------------------------------- */ ---------------------------------------------------------------------- */
__kernel void k_lj_tip4p_reneigh( __kernel void k_lj_tip4p_reneigh(
const __global numtyp4 *restrict x_, const __global numtyp4 *restrict x_,
@ -230,7 +230,7 @@ __kernel void k_lj_tip4p_reneigh(
__global int *restrict hneigh, __global int *restrict hneigh,
__global numtyp4 *restrict m, __global numtyp4 *restrict m,
const int typeO, const int typeH, const int typeO, const int typeH,
const __global tagint *restrict tag, const __global tagint *restrict tag,
const __global int *restrict map, const __global int *restrict map,
const __global int *restrict sametag) { const __global int *restrict sametag) {
@ -298,7 +298,7 @@ __kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
iO = i; iO = i;
numtyp qO; fetch(qO,iO,q_tex); numtyp qO; fetch(qO,iO,q_tex);
if (iH1>=0 && iH2>=0) { if (iH1>=0 && iH2>=0) {
compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_); compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_);
} else { } else {
m[iO] = ix; m[iO] = ix;
m[iO].w = qO; m[iO].w = qO;
@ -313,9 +313,9 @@ __kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Compute initial value of force, energy and virial for each local particle. Compute initial value of force, energy and virial for each local particle.
The values calculated on oxygens use the virtual charge position (m) and The values calculated on oxygens use the virtual charge position (m) and
they are stored in a separate array (ansO) for further distribution they are stored in a separate array (ansO) for further distribution
in a separate kernel. For some hydrogens located on the boundary in a separate kernel. For some hydrogens located on the boundary
of the local region, oxygens are non-local and the contribution of the local region, oxygens are non-local and the contribution
of oxygen is calculated separately in this kernel for them. of oxygen is calculated separately in this kernel for them.
---------------------------------------------------------------------- */ ---------------------------------------------------------------------- */
__kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
@ -325,7 +325,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -344,7 +344,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_charge(); local_allocate_store_charge();
acctyp4 f, fO; acctyp3 f;
acctyp4 fO;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
acctyp energy, e_coul, virial[6], vO[6]; acctyp energy, e_coul, virial[6], vO[6];
@ -386,6 +387,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj,factor_coul; numtyp factor_lj,factor_coul;
@ -470,7 +472,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
} }
if (EVFLAG && vflag) { if (EVFLAG && vflag) {
acctyp4 fd; acctyp3 fd;
fd.x = delx*force_coul; fd.x = delx*force_coul;
fd.y = dely*force_coul; fd.y = dely*force_coul;
fd.z = delz*force_coul; fd.z = delz*force_coul;
@ -645,7 +647,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -674,7 +676,8 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
if (EVFLAG && eflag) if (EVFLAG && eflag)
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f, fO; acctyp3 f;
acctyp4 fO;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
acctyp energy, e_coul, virial[6], vO[6]; acctyp energy, e_coul, virial[6], vO[6];
@ -717,6 +720,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
} }
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj,factor_coul; numtyp factor_lj,factor_coul;
@ -801,7 +805,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
} }
if (EVFLAG && vflag) { if (EVFLAG && vflag) {
acctyp4 fd; acctyp3 fd;
fd.x = delx*force_coul; fd.x = delx*force_coul;
fd.y = dely*force_coul; fd.y = dely*force_coul;
fd.z = delz*force_coul; fd.z = delz*force_coul;

View File

@ -31,7 +31,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -119,7 +120,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -139,7 +140,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
mie3[tid]=mie3_in[tid]; mie3[tid]=mie3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -161,6 +162,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -33,7 +33,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -120,7 +121,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -141,7 +142,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
mor2[tid]=mor2_in[tid]; mor2[tid]=mor2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -163,6 +164,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -52,7 +52,7 @@ _texture_2d( pos_tex,int4);
compute the id of the cell where the atoms belong to compute the id of the cell where the atoms belong to
x: atom coordinates x: atom coordinates
cell_id: cell ids cell_id: cell ids
particle_id: particle_id:
boxlo[0-2]: the lower left corner of the local box boxlo[0-2]: the lower left corner of the local box
ncell[xyz]: the number of cells in xyz dims ncell[xyz]: the number of cells in xyz dims
i_cell_size is the inverse cell size i_cell_size is the inverse cell size
@ -489,6 +489,10 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
#endif #endif
#define SPECIAL_DATA_PRELOAD_SIZE 3
#define UNROLL_FACTOR_LIST 4
#define UNROLL_FACTOR_SPECIAL 2
__kernel void kernel_special(__global int *dev_nbor, __kernel void kernel_special(__global int *dev_nbor,
__global int *host_nbor_list, __global int *host_nbor_list,
const __global int *host_numj, const __global int *host_numj,
@ -526,23 +530,68 @@ __kernel void kernel_special(__global int *dev_nbor,
list_end=list+fast_mul(numj,stride); list_end=list+fast_mul(numj,stride);
} }
for ( ; list<list_end; list+=stride) { #if SPECIAL_DATA_PRELOAD_SIZE > 0
int nbor=*list; tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE];
tagint jtag=tag[nbor]; for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) {
special_preload[j] = special[ii + i*nt];
}
#endif
int offset=ii; for ( ; list<list_end; list+=UNROLL_FACTOR_LIST * stride) {
for (int i=0; i<n3; i++) { int nbor[UNROLL_FACTOR_LIST];
if (special[offset]==jtag) { tagint jtag[UNROLL_FACTOR_LIST];
int which = 1; __global int* list_addr[UNROLL_FACTOR_LIST];
if (i>=n1) for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
which++; list_addr[l] = list + l*stride;
if (i>=n2) nbor[l] = *list_addr[l];
which++; }
nbor=nbor ^ (which << SBBITS); for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
*list=nbor; jtag[l] = tag[nbor[l]];
}
for (int i=0, j=0; i<n3; i+=UNROLL_FACTOR_SPECIAL, j++) {
tagint special_data[UNROLL_FACTOR_SPECIAL];
int which[UNROLL_FACTOR_SPECIAL];
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
which[c] = 1;
if (i + c < n3)
{
#if SPECIAL_DATA_PRELOAD_SIZE > 0
if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) {
special_data[c] = special_preload[j];
}
else
#endif
special_data[c] = special[ii + (i+c)*nt];
}
} }
offset+=nt;
for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
if (i+k >= n1) {
which[k]++;
}
}
for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
if (i+k >= n2) {
which[k]++;
}
which[k] <<= SBBITS;
}
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
if (i + c < n3) {
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
if (special_data[c] == jtag[l]) {
nbor[l]=nbor[l] ^ which[c];
}
}
}
}
}
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
*list_addr[l] = nbor[l];
} }
} }
} // if ii } // if ii
} }

View File

@ -217,7 +217,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
const grdtyp delxinv, const grdtyp delyinv, const grdtyp delxinv, const grdtyp delyinv,
const grdtyp delzinv, const int order, const grdtyp delzinv, const int order,
const int order2, const grdtyp qqrd2e_scale, const int order2, const grdtyp qqrd2e_scale,
__global acctyp4 *restrict ans) { __global acctyp3 *restrict ans) {
__local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
__local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
__local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
@ -239,7 +239,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
fetch(qs,ii,q_tex); fetch(qs,ii,q_tex);
qs*=qqrd2e_scale; qs*=qqrd2e_scale;
acctyp4 ek; acctyp3 ek;
ek.x=(acctyp)0.0; ek.x=(acctyp)0.0;
ek.y=(acctyp)0.0; ek.y=(acctyp)0.0;
ek.z=(acctyp)0.0; ek.z=(acctyp)0.0;

View File

@ -57,6 +57,7 @@
#define MAX_SHARED_TYPES 11 #define MAX_SHARED_TYPES 11
#define MAX_BIO_SHARED_TYPES 128 #define MAX_BIO_SHARED_TYPES 128
#define PPPM_MAX_SPLINE 8 #define PPPM_MAX_SPLINE 8
#define NBOR_PREFETCH 0
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// KERNEL MACROS // KERNEL MACROS

View File

@ -23,7 +23,7 @@
// THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR, // THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
// BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD, // BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
// BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES, // BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
// PPPM_MAX_SPLINE} // PPPM_MAX_SPLINE, NBOR_PREFETCH}
// //
//*************************************************************************/ //*************************************************************************/
@ -39,15 +39,15 @@ const char * ocl_config_names[] =
}; };
const char * ocl_config_strings[] = const char * ocl_config_strings[] =
{ {
"GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8", "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8,0",
"NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8", "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
"AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8", "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
#ifdef _SINGLE_SINGLE #ifdef _SINGLE_SINGLE
"INTEL_GPU,500,8,16,1,1,4,8,1,64,64,64,64,64,8,128,8,128,8", "INTEL_GPU,500,8,32,1,1,4,8,2,128,128,128,128,64,8,128,8,128,8,2",
"APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8", "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8,0",
#else #else
"INTEL_GPU,500,8,16,1,1,2,8,1,64,64,64,64,64,8,128,8,128,8", "INTEL_GPU,500,8,32,1,1,2,8,2,128,128,128,128,64,8,128,8,128,8,2",
"APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8", "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8,0",
#endif #endif
"INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8" "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8,0"
}; };

View File

@ -57,6 +57,10 @@ struct _lgpu_float2 {
float x; float y; float x; float y;
}; };
struct _lgpu_float3 {
  // Single-precision 3-component vector: x, y and z only (no w member).
  float x;
  float y;
  float z;
};
struct _lgpu_float4 { struct _lgpu_float4 {
float x; float y; float z; float w; float x; float y; float z; float w;
}; };
@ -65,6 +69,10 @@ struct _lgpu_double2 {
double x; double y; double x; double y;
}; };
struct _lgpu_double3 {
  // Double-precision 3-component vector: x, y and z only (no w member).
  double x;
  double y;
  double z;
};
struct _lgpu_double4 { struct _lgpu_double4 {
double x; double y; double z; double w; double x; double y; double z; double w;
}; };
@ -75,6 +83,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) {
return out; return out;
} }
// Write the x, y and z components of v to out, separated by single spaces,
// and return the same stream so calls can be chained.
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float3 &v) {
  return out << v.x << " " << v.y << " " << v.z;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) { inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) {
out << v.x << " " << v.y << " " << v.z; out << v.x << " " << v.y << " " << v.z;
return out; return out;
@ -85,6 +98,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) {
return out; return out;
} }
// Write the x, y and z components of v to out, separated by single spaces,
// and return the same stream so calls can be chained.
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double3 &v) {
  return out << v.x << " " << v.y << " " << v.z;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
out << v.x << " " << v.y << " " << v.z; out << v.x << " " << v.y << " " << v.z;
return out; return out;
@ -97,8 +115,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
#define PRECISION float #define PRECISION float
#define ACC_PRECISION double #define ACC_PRECISION double
#define numtyp2 _lgpu_float2 #define numtyp2 _lgpu_float2
#define numtyp3 _lgpu_float3
#define numtyp4 _lgpu_float4 #define numtyp4 _lgpu_float4
#define acctyp2 _lgpu_double2 #define acctyp2 _lgpu_double2
#define acctyp3 _lgpu_double3
#define acctyp4 _lgpu_double4 #define acctyp4 _lgpu_double4
#endif #endif
@ -107,8 +127,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
#define PRECISION double #define PRECISION double
#define ACC_PRECISION double #define ACC_PRECISION double
#define numtyp2 _lgpu_double2 #define numtyp2 _lgpu_double2
#define numtyp3 _lgpu_double3
#define numtyp4 _lgpu_double4 #define numtyp4 _lgpu_double4
#define acctyp2 _lgpu_double2 #define acctyp2 _lgpu_double2
#define acctyp3 _lgpu_double3
#define acctyp4 _lgpu_double4 #define acctyp4 _lgpu_double4
#endif #endif
@ -117,8 +139,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
#define PRECISION float #define PRECISION float
#define ACC_PRECISION float #define ACC_PRECISION float
#define numtyp2 _lgpu_float2 #define numtyp2 _lgpu_float2
#define numtyp3 _lgpu_float3
#define numtyp4 _lgpu_float4 #define numtyp4 _lgpu_float4
#define acctyp2 _lgpu_float2 #define acctyp2 _lgpu_float2
#define acctyp3 _lgpu_float3
#define acctyp4 _lgpu_float4 #define acctyp4 _lgpu_float4
#endif #endif

View File

@ -93,6 +93,13 @@
// Definition: Maximum order for splines in PPPM // Definition: Maximum order for splines in PPPM
// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
// //
// NBOR_PREFETCH
// Definition: Control use of prefetch for neighbor indices
// 0 = No prefetch
// 1 = Prefetch using standard API
// 2 = Prefetch using Intel intrinsics
// Restrictions: NBOR_PREFETCH forced to 0 when LAL_DISABLE_PREFETCH
// is defined in library build
//*************************************************************************/ //*************************************************************************/
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
@ -101,6 +108,7 @@
#if defined(NV_KERNEL) || defined(USE_HIP) #if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_pre_cuda_hip.h" #include "lal_pre_cuda_hip.h"
#define ucl_prefetch(p)
#define ucl_pow pow #define ucl_pow pow
#endif #endif
@ -169,7 +177,7 @@
#define ucl_abs fabs #define ucl_abs fabs
#define ucl_erfc erfc #define ucl_erfc erfc
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE) #if (FAST_MATH > 0) && !defined(_DOUBLE_DOUBLE)
#define ucl_exp native_exp #define ucl_exp native_exp
#define ucl_pow pow #define ucl_pow pow
@ -285,6 +293,55 @@
#define simd_size() SIMD_SIZE #define simd_size() SIMD_SIZE
#endif #endif
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - PREFETCH
// -------------------------------------------------------------------------
// NBOR_PREFETCH == 0: ucl_prefetch(p) expands to nothing, so call sites
// issue no prefetch.
#if (NBOR_PREFETCH == 0)
#define ucl_prefetch(p)
#endif
// NBOR_PREFETCH == 1: forward to the prefetch() built-in, requesting a
// single int starting at p.
#if (NBOR_PREFETCH == 1)
inline void ucl_prefetch(const __global int *p) {
prefetch(p, 1);
}
#endif
// NBOR_PREFETCH == 2: prefetch through a compiler built-in instead of the
// prefetch() call.
#if (NBOR_PREFETCH == 2)
// Load message caching control
enum LSC_LDCC {
LSC_LDCC_DEFAULT,
LSC_LDCC_L1UC_L3UC, //1 Override to L1 uncached and L3 uncached
LSC_LDCC_L1UC_L3C, //1 Override to L1 uncached and L3 cached
LSC_LDCC_L1C_L3UC, //1 Override to L1 cached and L3 uncached
LSC_LDCC_L1C_L3C, //1 Override to L1 cached and L3 cached
LSC_LDCC_L1S_L3UC, //1 Override to L1 streaming load and L3 uncached
LSC_LDCC_L1S_L3C, //1 Override to L1 streaming load and L3 cached
LSC_LDCC_L1IAR_L3C, //1 Override to L1 invalidate-after-read, and L3 cached
};
// Prototype only; no body is supplied here.
// NOTE(review): presumably resolved by the device compiler - confirm.
void __builtin_IB_lsc_prefetch_global_uint(const __global uint *base,
int elemOff,
enum LSC_LDCC cacheOpt); //D32V1
// Prefetch the element at p (element offset 0) using the
// "L1 cached and L3 uncached" caching option.
inline void ucl_prefetch(const __global int *p) {
__builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0,
LSC_LDCC_L1C_L3UC);
}
#endif
struct _lgpu_float3 {
  /* Single-precision 3-component vector: x, y and z only (no w member). */
  float x;
  float y;
  float z;
};
struct _lgpu_double3 {
  /* Double-precision 3-component vector: x, y and z only (no w member). */
  double x;
  double y;
  double z;
};
// Pick the struct behind acctyp3: the float struct when _SINGLE_SINGLE is
// defined, the double struct otherwise.
#ifdef _SINGLE_SINGLE
#define acctyp3 struct _lgpu_float3
#else
#define acctyp3 struct _lgpu_double3
#endif
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// END OPENCL DEFINITIONS // END OPENCL DEFINITIONS
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
@ -301,6 +358,9 @@
#define numtyp4 double4 #define numtyp4 double4
#define acctyp double #define acctyp double
#define acctyp2 double2 #define acctyp2 double2
#ifndef acctyp3
#define acctyp3 double3
#endif
#define acctyp4 double4 #define acctyp4 double4
#endif #endif
@ -310,6 +370,9 @@
#define numtyp4 float4 #define numtyp4 float4
#define acctyp double #define acctyp double
#define acctyp2 double2 #define acctyp2 double2
#ifndef acctyp3
#define acctyp3 double3
#endif
#define acctyp4 double4 #define acctyp4 double4
#endif #endif
@ -319,6 +382,9 @@
#define numtyp4 float4 #define numtyp4 float4
#define acctyp float #define acctyp float
#define acctyp2 float2 #define acctyp2 float2
#ifndef acctyp3
#define acctyp3 float3
#endif
#define acctyp4 float4 #define acctyp4 float4
#endif #endif

View File

@ -32,6 +32,9 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
return ans; return ans;
} }
#ifdef INTEL_OCL
__attribute__((intel_reqd_sub_group_size(16)))
#endif
__kernel void k_resquared(const __global numtyp4 *restrict x_, __kernel void k_resquared(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict q, const __global numtyp4 *restrict q,
const __global numtyp4 *restrict shape, const __global numtyp4 *restrict shape,
@ -41,7 +44,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
const int ntypes, const int ntypes,
const __global int *dev_nbor, const __global int *dev_nbor,
const int stride, const int stride,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
const int astride, const int astride,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
@ -62,7 +65,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0;
const numtyp cr60=ucl_cbrt((numtyp)60.0); const numtyp cr60=ucl_cbrt((numtyp)60.0);
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
@ -122,6 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_nbor+nbor+n_stride);
int j=dev_nbor[nbor]; int j=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;

View File

@ -105,28 +105,32 @@ void re_gpu_clear() {
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success, int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat); const int *ellipsoid, const EllipsoidBonus *bonus);
int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, int** re_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, tagint **special, double *subhi, tagint *tag, int **nspecial,
const bool eflag, const bool vflag, const bool eatom, tagint **special, const bool eflag, const bool vflag,
const bool vatom, int &host_start, int **ilist, const bool eatom, const bool vatom, int &host_start,
int **jnum, const double cpu_time, bool &success, int **ilist, int **jnum, const double cpu_time,
double **host_quat) { bool &success, const int *ellipsoid,
const void *bonus) {
return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
tag, nspecial, special, eflag, vflag, eatom, vatom, tag, nspecial, special, eflag, vflag, eatom, vatom,
host_start, ilist, jnum, cpu_time, success, host_quat); host_start, ilist, jnum, cpu_time, success, ellipsoid,
static_cast<const EllipsoidBonus *>(bonus));
} }
int * re_gpu_compute(const int ago, const int inum_full, const int nall, int * re_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) { const double cpu_time, bool &success,
const int *ellipsoid, const void *bonus) {
return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist, return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
numj, firstneigh, eflag, vflag, eatom, vatom, host_start, numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
cpu_time, success, host_quat); cpu_time, success, ellipsoid,
static_cast<const EllipsoidBonus *>(bonus));
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@ -135,4 +139,3 @@ int * re_gpu_compute(const int ago, const int inum_full, const int nall,
double re_gpu_bytes() { double re_gpu_bytes() {
return REMF.host_memory_usage(); return REMF.host_memory_usage();
} }

View File

@ -86,7 +86,7 @@
ap1+=astride; \ ap1+=astride; \
} \ } \
} \ } \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -131,7 +131,7 @@
ap1+=astride; \ ap1+=astride; \
} \ } \
} \ } \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -154,7 +154,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
const int ntypes, const int ntypes,
const __global int *dev_nbor, const __global int *dev_nbor,
const int stride, const int stride,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
const int astride, const int astride,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
@ -180,7 +180,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
const numtyp solv_f_r = const numtyp solv_f_r =
(numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
acctyp4 f, tor; acctyp3 f, tor;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
@ -216,6 +216,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_nbor+nbor+n_stride);
int j=dev_nbor[nbor]; int j=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -409,7 +410,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
const int ntypes, const int ntypes,
const __global int *dev_nbor, const __global int *dev_nbor,
const int stride, const int stride,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -435,7 +436,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
const numtyp solv_f_r = const numtyp solv_f_r =
(numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -454,6 +455,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_nbor+nbor+n_stride);
int i=dev_nbor[nbor]; int i=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(i)]; factor_lj = sp_lj[sbmask(i)];
i &= NEIGHMASK; i &= NEIGHMASK;
@ -610,7 +612,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
const __global numtyp *restrict gum, const __global numtyp *restrict gum,
const int stride, const int stride,
const __global int *dev_ij, const __global int *dev_ij,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
const int eflag, const int vflag, const int start, const int eflag, const int vflag, const int start,
@ -628,7 +630,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
sp_lj[2]=gum[2]; sp_lj[2]=gum[2];
sp_lj[3]=gum[3]; sp_lj[3]=gum[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -647,6 +649,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_ij+nbor+n_stride);
int j=dev_ij[nbor]; int j=dev_ij[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -697,7 +700,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict gum, const __global numtyp *restrict gum,
const int stride, const int stride,
const __global int *dev_ij, const __global int *dev_ij,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global int *restrict err_flag, __global int *restrict err_flag,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -721,7 +724,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -743,6 +746,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_ij+nbor+n_stride);
int j=dev_ij[nbor]; int j=dev_ij[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -32,7 +32,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -48,7 +48,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -119,7 +120,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -137,7 +138,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
coeff[tid]=coeff_in[tid]; coeff[tid]=coeff_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -159,6 +160,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -57,7 +57,7 @@ _texture( sw3_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -116,7 +116,7 @@ _texture( sw3_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -194,7 +194,7 @@ _texture( sw3_tex,int4);
if (t_per_atom>1) \ if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -265,7 +265,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
const __global numtyp4 * restrict c_14, const __global numtyp4 * restrict c_14,
const __global numtyp2 * restrict c_56, const __global numtyp2 * restrict c_56,
const int ntypes, const __global int * dev_nbor, const int ntypes, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -282,7 +282,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
if (EVFLAG && eflag) pre_sw_c56=c_56[ONETYPE]; if (EVFLAG && eflag) pre_sw_c56=c_56[ONETYPE];
#endif #endif
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -461,7 +461,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict sw_pre3, const __global numtyp2 *restrict sw_pre3,
const int ntypes, const int ntypes,
const __global int * dev_nbor, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -480,7 +480,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y; const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
#endif #endif
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -579,7 +579,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict sw_pre3, const __global numtyp2 *restrict sw_pre3,
const int ntypes, const __global int * dev_nbor, const int ntypes, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -598,7 +598,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y; const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
#endif #endif
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -701,7 +701,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict sw_pre3, const __global numtyp2 *restrict sw_pre3,
const int ntypes, const __global int * dev_nbor, const int ntypes, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -720,7 +720,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y; const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
#endif #endif
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {

View File

@ -49,7 +49,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -66,7 +66,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -87,6 +87,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -146,7 +147,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -165,7 +166,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
cutsq[tid]=cutsq_in[tid]; cutsq[tid]=cutsq_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -189,6 +190,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -251,7 +253,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -268,7 +270,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -289,6 +291,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -352,7 +355,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -371,7 +374,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
cutsq[tid]=cutsq_in[tid]; cutsq[tid]=cutsq_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -395,6 +398,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -461,7 +465,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -478,7 +482,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -499,6 +503,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -569,7 +574,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
const __global numtyp* sp_lj_in, const __global numtyp* sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *ans, __global acctyp3 *ans,
__global acctyp *engv, __global acctyp *engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -588,7 +593,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
cutsq[tid]=cutsq_in[tid]; cutsq[tid]=cutsq_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -611,6 +616,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -686,7 +692,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
const __global numtyp* sp_lj_in, const __global numtyp* sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *ans, __global acctyp3 *ans,
__global acctyp *engv, __global acctyp *engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -703,7 +709,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -724,6 +730,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -792,7 +799,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
const __global numtyp* sp_lj_in, const __global numtyp* sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *ans, __global acctyp3 *ans,
__global acctyp *engv, __global acctyp *engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -811,7 +818,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
cutsq[tid]=cutsq_in[tid]; cutsq[tid]=cutsq_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -835,6 +842,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -132,7 +132,7 @@ _texture_2d( pos_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -211,7 +211,7 @@ _texture_2d( pos_tex,int4);
if (t_per_atom>1) \ if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -448,7 +448,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
const __global int *restrict elem2param, const __global int *restrict elem2param,
const int nelements, const int nparams, const int nelements, const int nparams,
const __global int * dev_nbor, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -472,7 +472,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
const numtyp ijparam_bigd = ts2_in[ONETYPE3].w; const numtyp ijparam_bigd = ts2_in[ONETYPE3].w;
#endif #endif
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -553,7 +553,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
const __global acctyp2 *restrict zetaij, const __global acctyp2 *restrict zetaij,
const __global acctyp *restrict zetaij_e, const __global acctyp *restrict zetaij_e,
const __global int * dev_nbor, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -585,7 +585,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
const numtyp gamma = ts4_in[ONETYPE3].w; const numtyp gamma = ts4_in[ONETYPE3].w;
#endif #endif
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -728,7 +728,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const __global acctyp *restrict zetaij_e, const __global acctyp *restrict zetaij_e,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -760,7 +760,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const numtyp gamma = ts4_in[ONETYPE3].w; const numtyp gamma = ts4_in[ONETYPE3].w;
#endif #endif
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -950,7 +950,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp *restrict zetaij_e, const __global acctyp *restrict zetaij_e,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -982,7 +982,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const numtyp gamma = ts4_in[ONETYPE3].w; const numtyp gamma = ts4_in[ONETYPE3].w;
#endif #endif
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {

View File

@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -131,7 +131,7 @@ _texture_2d( pos_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -209,7 +209,7 @@ _texture_2d( pos_tex,int4);
if (t_per_atom>1) \ if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -417,7 +417,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
const __global int *restrict elem2param, const __global int *restrict elem2param,
const int nelements, const int nparams, const int nelements, const int nparams,
const __global int * dev_nbor, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -434,7 +434,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
ts2[tid]=ts2_in[tid]; ts2[tid]=ts2_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -511,7 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
const int nelements, const int nparams, const int nelements, const int nparams,
const __global acctyp4 *restrict zetaij, const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -535,7 +535,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
ts5[tid]=ts5_in[tid]; ts5[tid]=ts5_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -676,7 +676,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij, const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -700,7 +700,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
ts5[tid]=ts5_in[tid]; ts5[tid]=ts5_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -890,7 +890,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij, const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -914,7 +914,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
ts5[tid]=ts5_in[tid]; ts5[tid]=ts5_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {

View File

@ -81,7 +81,7 @@ _texture( ts6_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -149,7 +149,7 @@ _texture( ts6_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -227,7 +227,7 @@ _texture( ts6_tex,int4);
if (t_per_atom>1) \ if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -443,7 +443,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
const __global int *restrict elem2param, const __global int *restrict elem2param,
const int nelements, const int nparams, const int nelements, const int nparams,
const __global int * dev_nbor, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -462,7 +462,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
ts6[tid]=ts6_in[tid]; ts6[tid]=ts6_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -544,7 +544,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
const int nelements, const int nparams, const int nelements, const int nparams,
const __global acctyp4 *restrict zetaij, const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -566,7 +566,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
ts4[tid]=ts4_in[tid]; ts4[tid]=ts4_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -703,7 +703,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij, const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -725,7 +725,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
ts4[tid]=ts4_in[tid]; ts4[tid]=ts4_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -908,7 +908,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij, const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -930,7 +930,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
ts4[tid]=ts4_in[tid]; ts4[tid]=ts4_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {

View File

@ -33,7 +33,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -43,7 +43,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -61,6 +61,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -109,7 +110,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -130,7 +131,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
uf3[tid]=uf3_in[tid]; uf3[tid]=uf3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -151,6 +152,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -73,7 +73,7 @@ _texture( param5_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -132,7 +132,7 @@ _texture( param5_tex,int4);
} \ } \
} \ } \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -210,7 +210,7 @@ _texture( param5_tex,int4);
if (t_per_atom>1) \ if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) { \ if (offset==0 && ii<inum) { \
acctyp4 old=ans[ii]; \ acctyp3 old=ans[ii]; \
old.x+=f.x; \ old.x+=f.x; \
old.y+=f.y; \ old.y+=f.y; \
old.z+=f.z; \ old.z+=f.z; \
@ -247,6 +247,7 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
const int out_stride=nbor_pitch*t_per_atom-t_per_atom; const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) { for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
ucl_prefetch(dev_packed+nbor+nbor_pitch);
int sj=dev_packed[nbor]; int sj=dev_packed[nbor];
int j = sj & NEIGHMASK; int j = sj & NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -283,7 +284,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
const __global int *restrict elem2param, const __global int *restrict elem2param,
const int nelements, const int nelements,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int ev_stride) { const int nbor_pitch, const int ev_stride) {
@ -291,7 +292,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -313,6 +314,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
itype=map[itype]; itype=map[itype];
for ( ; nbor<nbor_end; nbor+=nbor_pitch) { for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
ucl_prefetch(dev_packed+nbor+nbor_pitch);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -489,7 +491,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
const __global int *restrict elem2param, const __global int *restrict elem2param,
const int nelements, const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -504,7 +506,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
local_allocate_store_three(); local_allocate_store_three();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -612,7 +614,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
const int nelements, const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -627,7 +629,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
local_allocate_store_three(); local_allocate_store_three();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -743,7 +745,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
const int nelements, const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_ilist, const __global int * dev_ilist,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -758,7 +760,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
local_allocate_store_three(); local_allocate_store_three();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {

View File

@ -30,7 +30,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -46,7 +46,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -65,6 +65,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -118,7 +119,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -136,7 +137,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
coeff[tid]=coeff_in[tid]; coeff[tid]=coeff_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -158,6 +159,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -40,7 +40,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -57,7 +57,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -77,6 +77,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
@ -131,7 +132,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
@ -150,7 +151,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
coeff[tid]=coeff_in[tid]; coeff[tid]=coeff_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -173,6 +174,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];

View File

@ -88,7 +88,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
const int lj_types, const int lj_types,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -98,7 +98,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -116,6 +116,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -179,7 +180,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
const numtyp cut_inner, const numtyp cut_inner,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp3 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
@ -198,7 +199,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
coeff3[tid]=coeff3_in[tid]; coeff3[tid]=coeff3_in[tid];
} }
acctyp4 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
@ -219,6 +220,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;

View File

@ -290,6 +290,20 @@ void FixGPU::init()
void FixGPU::setup(int vflag) void FixGPU::setup(int vflag)
{ {
// See if we should overlap topology list builds on CPU with work on GPU
int overlap_topo = 0;
if ((atom->molecular != Atom::ATOMIC)) {
PairHybrid *ph = reinterpret_cast<PairHybrid *>(force->pair_match("^hybrid",0));
if (ph) {
for (int isub=0; isub < ph->nstyles; ++isub) {
if (force->pair_match("gpu",0,isub)) overlap_topo = 1;
}
} else {
if (force->pair_match("gpu",0)) overlap_topo = 1;
}
}
if (overlap_topo) neighbor->set_overlap_topo(1);
if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
if (neighbor->exclude_setting() != 0) if (neighbor->exclude_setting() != 0)
error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds"); error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds");

View File

@ -243,12 +243,7 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
// update angular momentum by 1/2 step // update angular momentum by 1/2 step
if (igroup == 0) { if (igroup == 0) {
#if (LAL_USE_OMP_SIMD == 1) #if (LAL_USE_OMP_SIMD == 1)
// Workaround for compiler bug #pragma omp simd
#ifdef __INTEL_COMPILER
#pragma simd
#else
#pragma omp simd
#endif
#endif #endif
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
double *quat = bonus[ellipsoid[i]].quat; double *quat = bonus[ellipsoid[i]].quat;
@ -257,12 +252,7 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
} }
} else { } else {
#if (LAL_USE_OMP_SIMD == 1) #if (LAL_USE_OMP_SIMD == 1)
// Workaround for compiler bug #pragma omp simd
#ifdef __INTEL_COMPILER
#pragma simd
#else
#pragma omp simd
#endif
#endif #endif
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
if (mask[i] & groupbit) { if (mask[i] & groupbit) {

View File

@ -155,6 +155,15 @@ PairAmoebaGPU::~PairAmoebaGPU()
amoeba_gpu_clear(); amoeba_gpu_clear();
} }
/* ---------------------------------------------------------------------- */
void PairAmoebaGPU::compute(int eflag, int vflag)
{
  // Build the topology lists from here when the system is not purely atomic
  // and the neighbor list age counter is zero.
  // NOTE(review): presumably ago == 0 marks a step on which the neighbor
  // lists were just rebuilt, and this is the topology build that
  // FixGPU::setup() defers via set_overlap_topo(1) -- confirm.
  const bool rebuild_topology =
      (atom->molecular != Atom::ATOMIC) && (neighbor->ago == 0);
  if (rebuild_topology) {
    neighbor->build_topology();
  }

  // Forward the energy/virial flags unchanged to the base-class compute.
  PairAmoeba::compute(eflag, vflag);
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
init specific to this pair style init specific to this pair style
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */

View File

@ -28,6 +28,7 @@ class PairAmoebaGPU : public PairAmoeba {
public: public:
PairAmoebaGPU(LAMMPS *lmp); PairAmoebaGPU(LAMMPS *lmp);
~PairAmoebaGPU() override; ~PairAmoebaGPU() override;
void compute(int, int) override;
void init_style() override; void init_style() override;
double memory_usage() override; double memory_usage() override;

View File

@ -109,6 +109,8 @@ void PairBeckGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -129,6 +129,8 @@ void PairBornCoulLongCSGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -123,6 +123,8 @@ void PairBornCoulLongGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -117,6 +117,8 @@ void PairBornCoulWolfCSGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -114,6 +114,8 @@ void PairBornCoulWolfGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -109,6 +109,8 @@ void PairBornGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -111,6 +111,8 @@ void PairBuckCoulCutGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -120,6 +120,8 @@ void PairBuckCoulLongGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -107,6 +107,8 @@ void PairBuckGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -109,6 +109,8 @@ void PairColloidGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -108,6 +108,8 @@ void PairCoulCutGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -109,6 +109,8 @@ void PairCoulDebyeGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -118,6 +118,8 @@ void PairCoulDSFGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

View File

@ -123,6 +123,8 @@ void PairCoulLongCSGPU::compute(int eflag, int vflag)
} }
if (!success) error->one(FLERR, "Insufficient memory on accelerator"); if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) { if (host_start < inum) {
cpu_time = platform::walltime(); cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);

Some files were not shown because too many files have changed in this diff. (Show More)