Misc Improvements to GPU Package
- Optimizations for molecular systems - Improved kernel performance and greater CPU overlap - Reduced GPU to CPU communications for discrete devices - Switch classic Intel makefiles to use LLVM-based compilers - Prefetch optimizations supported for OpenCL - Optimized data repack for quaternions
This commit is contained in:
@ -319,7 +319,7 @@ CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
|
||||
THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
|
||||
BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
|
||||
BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
|
||||
PPPM_MAX_SPLINE.
|
||||
PPPM_MAX_SPLINE, NBOR_PREFETCH.
|
||||
|
||||
CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
|
||||
(NVIDIA) or OpenCL extensions (Intel) should be used for horizontal
|
||||
|
||||
@ -12,13 +12,12 @@ EXTRAMAKE = Makefile.lammps.opencl
|
||||
LMP_INC = -DLAMMPS_SMALLBIG
|
||||
|
||||
OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
|
||||
CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -fp-model fast=2 -no-prec-div \
|
||||
-qoverride-limits
|
||||
OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
|
||||
CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
|
||||
OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
|
||||
$(LMP_INC) $(OCL_INC) $(CPP_OPT)
|
||||
OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
|
||||
OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DGERYON_NO_OCL_MARKERS
|
||||
|
||||
BIN_DIR = ./
|
||||
OBJ_DIR = ./
|
||||
|
||||
28
lib/gpu/Makefile.oneapi_prof
Normal file
28
lib/gpu/Makefile.oneapi_prof
Normal file
@ -0,0 +1,28 @@
|
||||
# /* ----------------------------------------------------------------------
|
||||
# Linux Makefile for Intel oneAPI - Mixed precision (with timing enabled)
|
||||
# ------------------------------------------------------------------------- */
|
||||
|
||||
# which file will be copied to Makefile.lammps
|
||||
|
||||
EXTRAMAKE = Makefile.lammps.opencl
|
||||
|
||||
# this setting should match LAMMPS Makefile
|
||||
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
|
||||
|
||||
LMP_INC = -DLAMMPS_SMALLBIG
|
||||
|
||||
OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
|
||||
CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
|
||||
OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
|
||||
$(LMP_INC) $(OCL_INC) $(CPP_OPT)
|
||||
OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
|
||||
|
||||
BIN_DIR = ./
|
||||
OBJ_DIR = ./
|
||||
LIB_DIR = ./
|
||||
AR = ar
|
||||
BSH = /bin/sh
|
||||
|
||||
include Opencl.makefile
|
||||
@ -266,6 +266,7 @@ LAL_SERIALIZE_INIT Force serialization of initialization and compilation
|
||||
for multiple MPI tasks sharing the same accelerator.
|
||||
Some accelerator API implementations have had issues
|
||||
with temporary file conflicts in the past.
|
||||
LAL_DISABLE_PREFETCH Disable prefetch in kernels
|
||||
GERYON_FORCE_SHARED_MAIN_MEM_ON Should only be used for builds where the
|
||||
accelerator is guaranteed to share physical
|
||||
main memory with the host (e.g. integrated
|
||||
|
||||
@ -429,7 +429,7 @@ void UCL_Device::clear() {
|
||||
CU_SAFE_CALL_NS(cuCtxSetCurrent(_old_context));
|
||||
CU_SAFE_CALL_NS(cuDevicePrimaryCtxRelease(_cu_device));
|
||||
#else
|
||||
cuCtxDestroy(_context));
|
||||
cuCtxDestroy(_context);
|
||||
#endif
|
||||
}
|
||||
_device=-1;
|
||||
|
||||
@ -113,7 +113,7 @@ _texture( q_tex,int2);
|
||||
dufld[5]=red_acc[5][tid]; \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 t; \
|
||||
acctyp3 t; \
|
||||
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
|
||||
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
|
||||
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
|
||||
@ -147,7 +147,7 @@ _texture( q_tex,int2);
|
||||
_fieldp[5]=red_acc[5][tid]; \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 f, fp; \
|
||||
acctyp3 f, fp; \
|
||||
f.x = _fieldp[0]; \
|
||||
f.y = _fieldp[1]; \
|
||||
f.z = _fieldp[2]; \
|
||||
@ -174,7 +174,7 @@ _texture( q_tex,int2);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -254,7 +254,7 @@ _texture( q_tex,int2);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 t; \
|
||||
acctyp3 t; \
|
||||
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
|
||||
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
|
||||
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
|
||||
@ -277,7 +277,7 @@ _texture( q_tex,int2);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 f, fp; \
|
||||
acctyp3 f, fp; \
|
||||
f.x = _fieldp[0]; \
|
||||
f.y = _fieldp[1]; \
|
||||
f.z = _fieldp[2]; \
|
||||
@ -302,7 +302,7 @@ _texture( q_tex,int2);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -391,7 +391,7 @@ _texture( q_tex,int2);
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -416,9 +416,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp4 *restrict tep,
|
||||
__global acctyp3 *restrict tep,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nall, const int nbor_pitch,
|
||||
const int t_per_atom, const numtyp aewald,
|
||||
@ -431,7 +431,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -440,7 +440,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
||||
for (int l=0; l<6; l++) virial[l]=(acctyp)0;
|
||||
}
|
||||
|
||||
acctyp4 tq;
|
||||
acctyp3 tq;
|
||||
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
|
||||
|
||||
const __global numtyp4* polar1 = &extra[0];
|
||||
@ -695,7 +695,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict fieldp,
|
||||
__global acctyp3 *restrict fieldp,
|
||||
const int inum, const int nall,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
const numtyp aewald, const numtyp off2,
|
||||
@ -889,7 +889,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict fieldp,
|
||||
__global acctyp3 *restrict fieldp,
|
||||
const int inum, const int nall,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
const numtyp aewald, const numtyp off2,
|
||||
@ -1052,9 +1052,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp4 *restrict tep,
|
||||
__global acctyp3 *restrict tep,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nall, const int nbor_pitch, const int t_per_atom,
|
||||
const numtyp aewald, const numtyp felec,
|
||||
@ -1067,7 +1067,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
|
||||
@ -28,9 +28,9 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AnswerT::bytes_per_atom() const {
|
||||
int bytes=11*sizeof(acctyp);
|
||||
int bytes=10*sizeof(acctyp);
|
||||
if (_rot)
|
||||
bytes+=4*sizeof(acctyp);
|
||||
bytes+=3*sizeof(acctyp);
|
||||
if (_charge)
|
||||
bytes+=sizeof(acctyp);
|
||||
return bytes;
|
||||
@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) {
|
||||
|
||||
bool success=true;
|
||||
|
||||
_ans_fields=4;
|
||||
_ans_fields=3;
|
||||
if (_rot)
|
||||
_ans_fields+=4;
|
||||
_ans_fields+=3;
|
||||
|
||||
// --------------------------- Device allocations
|
||||
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
|
||||
@ -134,11 +134,11 @@ void AnswerT::clear() {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double AnswerT::host_memory_usage() const {
|
||||
int atom_bytes=4;
|
||||
int atom_bytes=3;
|
||||
if (_charge)
|
||||
atom_bytes+=1;
|
||||
if (_rot)
|
||||
atom_bytes+=4;
|
||||
atom_bytes+=3;
|
||||
int ans_bytes=atom_bytes+_ev_fields;
|
||||
return ans_bytes*(_max_local)*sizeof(acctyp)+
|
||||
sizeof(Answer<numtyp,acctyp>);
|
||||
@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
if (csize>0)
|
||||
engv.update_host(_ev_stride*csize,true);
|
||||
if (_rot)
|
||||
force.update_host(_inum*4*2,true);
|
||||
force.update_host(_inum*3*2,true);
|
||||
else
|
||||
force.update_host(_inum*4,true);
|
||||
force.update_host(_inum*3,true);
|
||||
time_answer.stop();
|
||||
|
||||
#ifndef GERYON_OCL_FLUSH
|
||||
@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::get_answers(double **f, double **tor) {
|
||||
if (_ilist==nullptr) {
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { acctyp x,y,z,w; } vec4d_t;
|
||||
auto fp=reinterpret_cast<vec3d*>(&(f[0][0]));
|
||||
auto forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
|
||||
auto fp=reinterpret_cast<double*>(&(f[0][0]));
|
||||
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel
|
||||
@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
#if (LAL_USE_OMP == 1)
|
||||
const int nthreads = omp_get_num_threads();
|
||||
const int tid = omp_get_thread_num();
|
||||
const int idelta = _inum / nthreads + 1;
|
||||
const int idelta = _inum*3 / nthreads + 1;
|
||||
const int ifrom = tid * idelta;
|
||||
const int ito = std::min(ifrom + idelta, _inum);
|
||||
const int ito = std::min(ifrom + idelta, _inum*3);
|
||||
#else
|
||||
const int ifrom = 0;
|
||||
const int ito = _inum;
|
||||
const int ito = _inum*3;
|
||||
#endif
|
||||
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
fp[i].x+=forcep[i].x;
|
||||
fp[i].y+=forcep[i].y;
|
||||
fp[i].z+=forcep[i].z;
|
||||
}
|
||||
for (int i=ifrom; i<ito; i++)
|
||||
fp[i]+=force[i];
|
||||
if (_rot) {
|
||||
auto torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
|
||||
auto torquep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
torp[i].x+=torquep[i].x;
|
||||
torp[i].y+=torquep[i].y;
|
||||
torp[i].z+=torquep[i].z;
|
||||
}
|
||||
auto torp=reinterpret_cast<double*>(&(tor[0][0]));
|
||||
auto torquep=&(force[_inum*3]);
|
||||
for (int i=ifrom; i<ito; i++)
|
||||
torp[i]+=torquep[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -344,7 +335,7 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
const int idelta = _inum / nthreads + 1;
|
||||
const int ifrom = tid * idelta;
|
||||
const int ito = std::min(ifrom + idelta, _inum);
|
||||
int fl=ifrom*4;
|
||||
int fl=ifrom*3;
|
||||
#else
|
||||
const int ifrom = 0;
|
||||
const int ito = _inum;
|
||||
@ -356,16 +347,16 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
f[ii][0]+=force[fl];
|
||||
f[ii][1]+=force[fl+1];
|
||||
f[ii][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
fl+=3;
|
||||
}
|
||||
if (_rot) {
|
||||
fl=_inum*4 + ifrom*4;
|
||||
fl=_inum*3 + ifrom*3;
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
int ii=_ilist[i];
|
||||
tor[ii][0]+=force[fl];
|
||||
tor[ii][1]+=force[fl+1];
|
||||
tor[ii][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
fl+=3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -114,7 +114,7 @@ bool AtomT::alloc(const int nall) {
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=q.device.row_bytes();
|
||||
}
|
||||
if (_rot && !_host_view) {
|
||||
if (_rot) {
|
||||
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=quat.device.row_bytes();
|
||||
@ -182,12 +182,10 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
if (rot && !_rot) {
|
||||
_rot=true;
|
||||
_other=true;
|
||||
if (!_host_view) {
|
||||
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=quat.device.row_bytes();
|
||||
}
|
||||
}
|
||||
|
||||
if (vel && !_vel) {
|
||||
_vel=true;
|
||||
@ -451,7 +449,7 @@ template <class numtyp, class acctyp>
|
||||
void AtomT::compile_kernels(UCL_Device &dev) {
|
||||
std::string flags = "";
|
||||
atom_program=new UCL_Program(dev);
|
||||
atom_program->load_string(atom,flags,nullptr,screen);
|
||||
atom_program->load_string(atom,flags.c_str(),nullptr,stderr);
|
||||
k_cast_x.set_function(*atom_program,"kernel_cast_x");
|
||||
_compiled=true;
|
||||
}
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
#endif
|
||||
|
||||
__kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
|
||||
const __global numtyp *restrict x,
|
||||
const __global double *restrict x,
|
||||
const __global int *restrict type,
|
||||
const int nall) {
|
||||
int ii=GLOBAL_ID_X;
|
||||
|
||||
@ -52,6 +52,12 @@ using namespace ucl_cudadr;
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
struct EllipsoidBonus {
|
||||
double shape[3];
|
||||
double quat[4];
|
||||
int ilocal;
|
||||
};
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class Atom {
|
||||
public:
|
||||
@ -306,8 +312,8 @@ class Atom {
|
||||
if (_x_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
#ifdef GPU_CAST
|
||||
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
|
||||
memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
|
||||
#else
|
||||
vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
|
||||
vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
|
||||
@ -351,6 +357,24 @@ class Atom {
|
||||
add_x_data(host_ptr,host_type);
|
||||
}
|
||||
|
||||
// Cast mu data to write buffer (stored in quat)
|
||||
template<class cpytyp>
|
||||
inline void cast_mu_data(cpytyp *host_ptr) {
|
||||
if (_quat_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
|
||||
else
|
||||
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp parallel for simd schedule(static)
|
||||
#elif (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd
|
||||
#endif
|
||||
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
}
|
||||
|
||||
// Cast charges to write buffer
|
||||
template<class cpytyp>
|
||||
inline void cast_q_data(cpytyp *host_ptr) {
|
||||
@ -384,22 +408,24 @@ class Atom {
|
||||
}
|
||||
|
||||
// Cast quaternions to write buffer
|
||||
template<class cpytyp>
|
||||
inline void cast_quat_data(cpytyp *host_ptr) {
|
||||
inline void cast_quat_data(const int *ellipsoid,
|
||||
const EllipsoidBonus *bonus) {
|
||||
if (_quat_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
if (_host_view) {
|
||||
quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
|
||||
quat.device.view(quat.host);
|
||||
} else if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
|
||||
else
|
||||
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp parallel for simd schedule(static)
|
||||
#elif (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd
|
||||
#endif
|
||||
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
|
||||
for (int i=0; i<_nall; i++) {
|
||||
int qi = ellipsoid[i];
|
||||
if (qi > -1) {
|
||||
quat[i*4] = bonus[qi].quat[0];
|
||||
quat[i*4+1] = bonus[qi].quat[1];
|
||||
quat[i*4+2] = bonus[qi].quat[2];
|
||||
quat[i*4+3] = bonus[qi].quat[3];
|
||||
}
|
||||
}
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
}
|
||||
@ -419,10 +445,6 @@ class Atom {
|
||||
inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
|
||||
if (_v_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
#ifdef GPU_CAST
|
||||
memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
|
||||
#else
|
||||
vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
|
||||
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
|
||||
#if (LAL_USE_OMP == 1)
|
||||
@ -434,7 +456,6 @@ class Atom {
|
||||
vp[i].z=host_p[i].z;
|
||||
vp[i].w=host_tag[i];
|
||||
}
|
||||
#endif
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
}
|
||||
@ -444,16 +465,7 @@ class Atom {
|
||||
inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
|
||||
time_vel.start();
|
||||
if (_v_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
v_cast.update_device(_nall*3,true);
|
||||
tag_cast.update_device(_nall,true);
|
||||
int block_size=64;
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
|
||||
k_cast_x.set_size(GX,block_size);
|
||||
k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
|
||||
#else
|
||||
v.update_device(_nall*4,true);
|
||||
#endif
|
||||
_v_avail=true;
|
||||
}
|
||||
time_vel.stop();
|
||||
@ -519,7 +531,7 @@ class Atom {
|
||||
UCL_Vector<numtyp4,numtyp4> extra;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
UCL_Vector<numtyp,numtyp> x_cast;
|
||||
UCL_Vector<double,double> x_cast;
|
||||
UCL_Vector<int,int> type_cast;
|
||||
#endif
|
||||
|
||||
|
||||
@ -143,10 +143,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
|
||||
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
|
||||
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
_tep.alloc(_max_tep_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
_max_fieldp_size = _max_tep_size;
|
||||
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
_fieldp.alloc(_max_fieldp_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
_max_thetai_size = 0;
|
||||
|
||||
@ -387,7 +387,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
|
||||
|
||||
if (inum_full>_max_tep_size) {
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_tep.resize(_max_tep_size*4);
|
||||
_tep.resize(_max_tep_size*3);
|
||||
}
|
||||
*tep_ptr=_tep.host.begin();
|
||||
|
||||
@ -403,7 +403,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
|
||||
|
||||
// copy tep from device to host
|
||||
|
||||
_tep.update_host(_max_tep_size*4,false);
|
||||
_tep.update_host(_max_tep_size*3,false);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -429,7 +429,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double
|
||||
|
||||
// copy field and fieldp from device to host (_fieldp store both arrays, one after another)
|
||||
|
||||
_fieldp.update_host(_max_fieldp_size*8,false);
|
||||
_fieldp.update_host(_max_fieldp_size*6,false);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -456,7 +456,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
|
||||
// NOTE: move this step to update_fieldp() to delay device-host transfer
|
||||
// after umutual1 and self are done on the GPU
|
||||
// *fieldp_ptr=_fieldp.host.begin();
|
||||
// _fieldp.update_host(_max_fieldp_size*8,false);
|
||||
// _fieldp.update_host(_max_fieldp_size*6,false);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -732,7 +732,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
|
||||
device->add_ans_object(ans);
|
||||
|
||||
// copy tep from device to host
|
||||
_tep.update_host(_max_tep_size*4,false);
|
||||
_tep.update_host(_max_tep_size*3,false);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@ -233,7 +233,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
|
||||
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_q_data(host_q);
|
||||
atom->cast_quat_data(host_mu[0]);
|
||||
atom->cast_mu_data(host_mu[0]);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
atom->add_q_data();
|
||||
@ -297,12 +297,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
|
||||
if (!success)
|
||||
return nullptr;
|
||||
atom->cast_q_data(host_q);
|
||||
atom->cast_quat_data(host_mu[0]);
|
||||
atom->cast_mu_data(host_mu[0]);
|
||||
hd_balancer.start_timer();
|
||||
} else {
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_q_data(host_q);
|
||||
atom->cast_quat_data(host_mu[0]);
|
||||
atom->cast_mu_data(host_mu[0]);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
|
||||
@ -375,7 +375,8 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, double **host_quat) {
|
||||
bool &success, const int *ellipsoid,
|
||||
const EllipsoidBonus *bonus) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eflag_in) eflag=2;
|
||||
@ -409,7 +410,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
|
||||
list=ilist;
|
||||
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_quat_data(host_quat[0]);
|
||||
atom->cast_quat_data(ellipsoid,bonus);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
atom->add_quat_data();
|
||||
@ -433,7 +434,8 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_quat) {
|
||||
const int *ellipsoid,
|
||||
const EllipsoidBonus *bonus) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eflag_in) eflag=2;
|
||||
@ -460,11 +462,11 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return nullptr;
|
||||
atom->cast_quat_data(host_quat[0]);
|
||||
atom->cast_quat_data(ellipsoid,bonus);
|
||||
hd_balancer.start_timer();
|
||||
} else {
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_quat_data(host_quat[0]);
|
||||
atom->cast_quat_data(ellipsoid,bonus);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
|
||||
@ -170,7 +170,8 @@ class BaseEllipsoid {
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, double **quat);
|
||||
const double cpu_time, bool &success,
|
||||
const int *ellipsoid, const EllipsoidBonus *bonus);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int**compute(const int ago, const int inum_full, const int nall,
|
||||
@ -179,7 +180,7 @@ class BaseEllipsoid {
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double **host_quat);
|
||||
const int *ellipsoid, const EllipsoidBonus *bonus);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
|
||||
@ -31,7 +31,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -47,7 +47,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -66,6 +66,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -130,7 +131,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -150,7 +151,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
||||
beck2[tid]=beck2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -172,6 +173,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -32,7 +32,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -48,7 +48,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -67,6 +67,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -123,7 +124,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -144,7 +145,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -166,6 +167,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -60,7 +60,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -80,6 +80,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -158,7 +159,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -183,7 +184,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -206,6 +207,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -51,7 +51,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -75,7 +75,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -95,6 +95,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -192,7 +193,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -217,7 +218,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -240,6 +241,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -38,7 +38,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -63,7 +63,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -89,6 +89,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -174,7 +175,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -200,7 +201,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -229,6 +230,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -39,7 +39,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -64,7 +64,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -90,6 +90,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -176,7 +177,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -202,7 +203,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -231,6 +232,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -31,7 +31,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -47,7 +47,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -66,6 +66,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -120,7 +121,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -141,7 +142,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -163,6 +164,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -59,7 +59,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -79,6 +79,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -151,7 +152,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -177,7 +178,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -200,6 +201,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -60,7 +60,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -80,6 +80,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -159,7 +160,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -185,7 +186,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -208,6 +209,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -34,7 +34,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -53,7 +53,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_bio();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -73,6 +73,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -159,7 +160,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -187,7 +188,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
|
||||
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
|
||||
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -209,6 +210,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -35,7 +35,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -50,7 +50,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_bio();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -70,6 +70,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -156,7 +157,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -181,7 +182,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
|
||||
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
|
||||
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -203,6 +204,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -34,7 +34,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
||||
const __global int *form,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -50,7 +50,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -69,6 +69,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -188,7 +189,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
const __global int *form_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -215,7 +216,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -237,6 +238,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -35,7 +35,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -54,7 +54,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
|
||||
sp_cl[2]=sp_cl_in[2];
|
||||
sp_cl[3]=sp_cl_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -74,6 +74,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_coul;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_coul = sp_cl[sbmask(j)];
|
||||
@ -125,7 +126,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -146,7 +147,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
|
||||
cutsq[tid]=_cutsq[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -169,6 +170,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_coul;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_coul = sp_cl[sbmask(j)];
|
||||
|
||||
@ -35,7 +35,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -55,7 +55,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
|
||||
sp_cl[2]=sp_cl_in[2];
|
||||
sp_cl[3]=sp_cl_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -75,6 +75,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_coul;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_coul = sp_cl[sbmask(j)];
|
||||
@ -129,7 +130,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -153,7 +154,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
|
||||
cutsq[tid]=_cutsq[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -176,6 +177,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_coul;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_coul = sp_cl[sbmask(j)];
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -56,7 +56,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -81,6 +81,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_coul, r, prefactor, erfcc;
|
||||
@ -138,7 +139,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -156,7 +157,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
|
||||
if (tid<4)
|
||||
sp_lj[tid]=sp_lj_in[tid];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -183,6 +184,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_coul, r, prefactor, erfcc;
|
||||
|
||||
@ -35,7 +35,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -54,7 +54,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
|
||||
sp_cl[2]=sp_cl_in[2];
|
||||
sp_cl[3]=sp_cl_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -73,6 +73,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_coul;
|
||||
@ -132,7 +133,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -152,7 +153,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
|
||||
scale[tid]=scale_in[tid];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -174,6 +175,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_coul;
|
||||
|
||||
@ -49,7 +49,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -68,7 +68,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
|
||||
sp_cl[2]=sp_cl_in[2];
|
||||
sp_cl[3]=sp_cl_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -87,6 +87,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_coul;
|
||||
@ -166,7 +167,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -186,7 +187,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
|
||||
scale[tid]=scale_in[tid];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -208,6 +209,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_coul;
|
||||
|
||||
@ -370,7 +370,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
|
||||
|
||||
_ocl_config_name="CUSTOM";
|
||||
int token_count=0;
|
||||
std::string params[18];
|
||||
std::string params[19];
|
||||
char ocl_config[2048];
|
||||
strncpy(ocl_config,s_config.c_str(),2047);
|
||||
char *pch = strtok(ocl_config,",");
|
||||
@ -378,7 +378,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
|
||||
pch = strtok(nullptr,",");
|
||||
if (pch == nullptr) return -11;
|
||||
while (pch != nullptr) {
|
||||
if (token_count==18)
|
||||
if (token_count==19)
|
||||
return -11;
|
||||
params[token_count]=pch;
|
||||
token_count++;
|
||||
@ -389,6 +389,16 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
|
||||
#ifdef CL_VERSION_2_0
|
||||
_ocl_compile_string+="-cl-std=CL2.0 ";
|
||||
#endif
|
||||
if (params[0]=="500") {
|
||||
_ocl_compile_string+="-DINTEL_OCL ";
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
// workaround for double precision with Intel OpenCL
|
||||
params[4]="0";
|
||||
#endif
|
||||
}
|
||||
#ifdef LAL_DISABLE_PREFETCH
|
||||
params[18]="0";
|
||||
#endif
|
||||
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
|
||||
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
|
||||
std::string(OCL_PRECISION_COMPILE);
|
||||
@ -421,7 +431,8 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
|
||||
|
||||
" -DMAX_SHARED_TYPES="+params[15]+
|
||||
" -DMAX_BIO_SHARED_TYPES="+params[16]+
|
||||
" -DPPPM_MAX_SPLINE="+params[17];
|
||||
" -DPPPM_MAX_SPLINE="+params[17]+
|
||||
" -DNBOR_PREFETCH="+params[18];
|
||||
_ocl_compile_string += extra_args;
|
||||
#endif
|
||||
return 0;
|
||||
@ -558,7 +569,11 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
|
||||
return -3;
|
||||
|
||||
if (_user_cell_size<0.0) {
|
||||
#ifndef LAL_USE_OLD_NEIGHBOR
|
||||
_neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
|
||||
#else
|
||||
_neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
|
||||
#endif
|
||||
} else
|
||||
_neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
|
||||
nbor->set_cutoff(cutoff);
|
||||
@ -954,7 +969,7 @@ int DeviceT::compile_kernels() {
|
||||
k_info.set_function(*dev_program,"kernel_info");
|
||||
_compiled=true;
|
||||
|
||||
UCL_Vector<int,int> gpu_lib_data(19,*gpu,UCL_NOT_PINNED);
|
||||
UCL_Vector<int,int> gpu_lib_data(20,*gpu,UCL_NOT_PINNED);
|
||||
k_info.set_size(1,1);
|
||||
k_info.run(&gpu_lib_data);
|
||||
gpu_lib_data.update_host(false);
|
||||
|
||||
@ -52,4 +52,5 @@ __kernel void kernel_info(__global int *info) {
|
||||
info[16]=MAX_SHARED_TYPES;
|
||||
info[17]=MAX_BIO_SHARED_TYPES;
|
||||
info[18]=PPPM_MAX_SPLINE;
|
||||
info[19]=NBOR_PREFETCH;
|
||||
}
|
||||
|
||||
@ -211,7 +211,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -235,7 +235,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
@ -257,6 +257,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -282,8 +283,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
|
||||
numtyp rinv, r3inv, r5inv, r7inv;
|
||||
numtyp pre1, pre2, pre3, pre4;
|
||||
numtyp pdotp, pidotr, pjdotr;
|
||||
acctyp4 forcecoul, ticoul;
|
||||
acctyp4 force;
|
||||
acctyp3 forcecoul, ticoul;
|
||||
acctyp3 force;
|
||||
|
||||
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
|
||||
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
|
||||
@ -418,7 +419,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -445,7 +446,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
@ -470,6 +471,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -494,8 +496,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
|
||||
numtyp rinv, r3inv, r5inv, r7inv;
|
||||
numtyp pre1, pre2, pre3, pre4;
|
||||
numtyp pdotp, pidotr, pjdotr;
|
||||
acctyp4 forcecoul, ticoul;
|
||||
acctyp4 force;
|
||||
acctyp3 forcecoul, ticoul;
|
||||
acctyp3 force;
|
||||
|
||||
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
|
||||
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
|
||||
|
||||
@ -212,7 +212,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -236,7 +236,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
@ -258,6 +258,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -286,8 +287,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
|
||||
numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
|
||||
numtyp4 aforcecoul, bforcecoul;
|
||||
|
||||
acctyp4 forcecoul, ticoul;
|
||||
acctyp4 force;
|
||||
acctyp3 forcecoul, ticoul;
|
||||
acctyp3 force;
|
||||
|
||||
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
|
||||
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
|
||||
@ -450,7 +451,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -478,7 +479,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
@ -503,6 +504,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -530,8 +532,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
|
||||
numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
|
||||
numtyp4 aforcecoul, bforcecoul;
|
||||
|
||||
acctyp4 forcecoul, ticoul;
|
||||
acctyp4 force;
|
||||
acctyp3 forcecoul, ticoul;
|
||||
acctyp3 force;
|
||||
|
||||
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
|
||||
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
|
||||
|
||||
@ -213,7 +213,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -238,7 +238,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
@ -264,6 +264,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -291,8 +292,8 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
|
||||
numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
|
||||
numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
|
||||
numtyp fdx,fdy,fdz,fax,fay,faz;
|
||||
acctyp4 forcecoul, ticoul;
|
||||
acctyp4 force;
|
||||
acctyp3 forcecoul, ticoul;
|
||||
acctyp3 force;
|
||||
|
||||
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
|
||||
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
|
||||
@ -462,7 +463,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -490,7 +491,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
@ -519,6 +520,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -545,8 +547,8 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
|
||||
numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
|
||||
numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
|
||||
numtyp fdx,fdy,fdz,fax,fay,faz;
|
||||
acctyp4 forcecoul, ticoul;
|
||||
acctyp4 force;
|
||||
acctyp3 forcecoul, ticoul;
|
||||
acctyp3 force;
|
||||
|
||||
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
|
||||
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
|
||||
|
||||
@ -168,7 +168,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_sqrt,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -183,7 +183,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -203,6 +203,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_dpd, factor_sqrt;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_dpd = sp_lj[sbmask(j)];
|
||||
@ -284,7 +285,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_sqrt_in,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -318,7 +319,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -343,6 +344,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
|
||||
numtyp factor_dpd, factor_sqrt;
|
||||
#endif
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
#ifndef ONETYPE
|
||||
|
||||
@ -246,6 +246,7 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
|
||||
tfrho=type2frho[itype];
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
@ -332,6 +333,7 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
|
||||
#endif
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
@ -376,7 +378,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *cutsq,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *ans,
|
||||
__global acctyp3 *ans,
|
||||
__global acctyp *engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int ntypes,
|
||||
@ -388,7 +390,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_answers_eam();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -407,6 +409,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
@ -487,7 +490,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
|
||||
const __global numtyp *cutsq,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *ans,
|
||||
__global acctyp3 *ans,
|
||||
__global acctyp *engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const numtyp cutforcesq,
|
||||
@ -510,7 +513,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
|
||||
int n_stride;
|
||||
local_allocate_store_answers_eam();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -532,6 +535,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
|
||||
#endif
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
|
||||
@ -152,7 +152,7 @@ _texture_2d( quat_tex,int4);
|
||||
engv+=inum; \
|
||||
} \
|
||||
} \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -224,7 +224,7 @@ _texture_2d( quat_tex,int4);
|
||||
engv+=inum; \
|
||||
} \
|
||||
} \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
|
||||
@ -30,7 +30,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -40,7 +40,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -59,6 +59,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -109,7 +110,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -127,7 +128,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
|
||||
gauss1[tid]=gauss1_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -149,6 +150,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -80,6 +80,9 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
|
||||
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
|
||||
}
|
||||
|
||||
#ifdef INTEL_OCL
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
#endif
|
||||
__kernel void k_gayberne(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict q,
|
||||
const __global numtyp4 *restrict shape,
|
||||
@ -90,7 +93,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict lshape,
|
||||
const __global int *dev_nbor,
|
||||
const int stride,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
const int astride,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
@ -108,7 +111,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=gum[5];
|
||||
sp_lj[3]=gum[6];
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
@ -138,6 +141,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_nbor+nbor+n_stride);
|
||||
int j=dev_nbor[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
@ -108,28 +108,33 @@ int** compute(const int ago, const int inum_full, const int nall,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double **host_quat);
|
||||
const int *ellipsoid, const EllipsoidBonus *bonus);
|
||||
|
||||
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial, tagint **special,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start, int **ilist,
|
||||
int **jnum, const double cpu_time, bool &success,
|
||||
double **host_quat) {
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, const int *ellipsoid,
|
||||
const void *bonus) {
|
||||
return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
|
||||
tag, nspecial, special, eflag, vflag, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time, success, host_quat);
|
||||
host_start, ilist, jnum, cpu_time, success,
|
||||
ellipsoid,
|
||||
static_cast<const EllipsoidBonus *>(bonus));
|
||||
}
|
||||
|
||||
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, double **host_quat) {
|
||||
const double cpu_time, bool &success,
|
||||
const int *ellipsoid, const void *bonus) {
|
||||
return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
|
||||
numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
|
||||
cpu_time, success, host_quat);
|
||||
cpu_time, success, ellipsoid,
|
||||
static_cast<const EllipsoidBonus *>(bonus));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@ -34,7 +34,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict lshape,
|
||||
const __global int *dev_nbor,
|
||||
const int stride,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag,
|
||||
@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=gum[5];
|
||||
sp_lj[3]=gum[6];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -75,6 +75,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_nbor+nbor+n_stride);
|
||||
|
||||
int j=dev_nbor[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -259,7 +260,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict gum,
|
||||
const int stride,
|
||||
const __global int *dev_ij,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag, const int start,
|
||||
@ -277,7 +278,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=gum[5];
|
||||
sp_lj[3]=gum[6];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -296,6 +297,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_ij+nbor+n_stride);
|
||||
|
||||
int j=dev_ij[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -347,7 +349,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict gum,
|
||||
const int stride,
|
||||
const __global int *dev_ij,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag,
|
||||
@ -371,7 +373,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -393,6 +395,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_ij+nbor+n_stride);
|
||||
|
||||
int j=dev_ij[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -113,7 +113,7 @@ _texture( q_tex,int2);
|
||||
dufld[5]=red_acc[5][tid]; \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 t; \
|
||||
acctyp3 t; \
|
||||
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
|
||||
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
|
||||
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
|
||||
@ -147,7 +147,7 @@ _texture( q_tex,int2);
|
||||
_fieldp[5]=red_acc[5][tid]; \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 f, fp; \
|
||||
acctyp3 f, fp; \
|
||||
f.x = _fieldp[0]; \
|
||||
f.y = _fieldp[1]; \
|
||||
f.z = _fieldp[2]; \
|
||||
@ -174,7 +174,7 @@ _texture( q_tex,int2);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -254,7 +254,7 @@ _texture( q_tex,int2);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 t; \
|
||||
acctyp3 t; \
|
||||
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
|
||||
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
|
||||
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
|
||||
@ -277,7 +277,7 @@ _texture( q_tex,int2);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 f, fp; \
|
||||
acctyp3 f, fp; \
|
||||
f.x = _fieldp[0]; \
|
||||
f.y = _fieldp[1]; \
|
||||
f.z = _fieldp[2]; \
|
||||
@ -302,7 +302,7 @@ _texture( q_tex,int2);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -391,7 +391,7 @@ _texture( q_tex,int2);
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -416,9 +416,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp4 *restrict tep,
|
||||
__global acctyp3 *restrict tep,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nall, const int nbor_pitch,
|
||||
const int t_per_atom, const numtyp aewald,
|
||||
@ -432,7 +432,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -441,7 +441,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
|
||||
for (int l=0; l<6; l++) virial[l]=(acctyp)0;
|
||||
}
|
||||
|
||||
acctyp4 tq;
|
||||
acctyp3 tq;
|
||||
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
|
||||
|
||||
const __global numtyp4* polar1 = &extra[0];
|
||||
@ -717,7 +717,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nall, const int nbor_pitch,
|
||||
@ -730,7 +730,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -895,9 +895,9 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp4 *restrict tep,
|
||||
__global acctyp3 *restrict tep,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nall, const int nbor_pitch,
|
||||
const int t_per_atom, const numtyp aewald,
|
||||
@ -910,7 +910,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -919,7 +919,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
|
||||
for (int l=0; l<6; l++) virial[l]=(acctyp)0;
|
||||
}
|
||||
|
||||
acctyp4 tq;
|
||||
acctyp3 tq;
|
||||
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
|
||||
|
||||
const __global numtyp4* polar1 = &extra[0];
|
||||
@ -1210,7 +1210,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict fieldp,
|
||||
__global acctyp3 *restrict fieldp,
|
||||
const int inum, const int nall,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
const numtyp aewald, const numtyp off2,
|
||||
@ -1390,7 +1390,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict fieldp,
|
||||
__global acctyp3 *restrict fieldp,
|
||||
const int inum, const int nall,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
const numtyp aewald, const numtyp off2,
|
||||
@ -1541,9 +1541,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp4 *restrict tep,
|
||||
__global acctyp3 *restrict tep,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nall, const int nbor_pitch, const int t_per_atom,
|
||||
const numtyp aewald, const numtyp felec,
|
||||
@ -1556,7 +1556,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
|
||||
@ -31,7 +31,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -41,7 +41,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -59,6 +59,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -110,7 +111,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -144,7 +145,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -166,6 +167,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
NOUNROLL
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
#ifndef ONETYPE
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -31,7 +31,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -47,7 +47,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -66,6 +66,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -118,7 +119,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -139,7 +140,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -161,6 +162,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -59,7 +59,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -79,6 +79,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -156,7 +157,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -182,7 +183,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -205,6 +206,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -59,7 +59,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -79,6 +79,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -147,7 +148,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -173,7 +174,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -196,6 +197,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -60,7 +60,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -80,6 +80,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -154,7 +155,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -181,7 +182,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -204,6 +205,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -59,7 +59,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -79,6 +79,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -154,7 +155,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -178,7 +179,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -201,6 +202,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -94,7 +94,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -117,7 +117,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -139,6 +139,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
|
||||
numtyp cut_coul = ucl_sqrt(cut_coulsq);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -215,7 +216,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -239,7 +240,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -264,6 +265,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
|
||||
numtyp cut_coul = ucl_sqrt(cut_coulsq);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -39,7 +39,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -49,7 +49,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -67,6 +67,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -132,7 +133,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -155,7 +156,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -176,6 +177,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -38,7 +38,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -62,7 +62,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -88,6 +88,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul, r, prefactor, erfcc;
|
||||
@ -165,7 +166,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -190,7 +191,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -219,6 +220,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul, r, prefactor, erfcc;
|
||||
|
||||
@ -33,7 +33,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -49,7 +49,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -68,6 +68,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -122,7 +123,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -143,7 +144,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -165,6 +166,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -59,7 +59,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -79,6 +79,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -158,7 +159,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -181,7 +182,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -204,6 +205,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -34,7 +34,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -50,7 +50,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -68,6 +68,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -134,7 +135,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -156,7 +157,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
|
||||
ljsw[tid]=ljsw_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -177,6 +178,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -33,7 +33,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -43,7 +43,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -63,6 +63,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
|
||||
numtyp r, t, tsq, fskin;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -135,7 +136,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -169,7 +170,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -194,6 +195,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
NOUNROLL
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
#ifndef ONETYPE
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -31,7 +31,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -47,7 +47,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -66,6 +66,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -128,7 +129,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -149,7 +150,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -171,6 +172,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -36,7 +36,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
@ -59,7 +59,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -79,6 +79,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
@ -166,7 +167,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -189,7 +190,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -212,6 +213,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
|
||||
@ -143,7 +143,7 @@ ucl_inline void compute_newsite(int iO, int iH1, int iH2,
|
||||
---------------------------------------------------------------------- */
|
||||
__kernel void k_lj_tip4p_long_distrib(
|
||||
const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -155,7 +155,7 @@ __kernel void k_lj_tip4p_long_distrib(
|
||||
const __global acctyp4 *restrict ansO) {
|
||||
|
||||
int i = BLOCK_ID_X*(BLOCK_SIZE_X)+THREAD_ID_X;
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
|
||||
if (i<inum) {
|
||||
@ -208,7 +208,7 @@ __kernel void k_lj_tip4p_long_distrib(
|
||||
engv[inum*engv_iter + i] += vM.z * (acctyp)(1 - alpha);
|
||||
}
|
||||
}
|
||||
acctyp4 old=ans[i];
|
||||
acctyp3 old=ans[i];
|
||||
old.x+=f.x;
|
||||
old.y+=f.y;
|
||||
old.z+=f.z;
|
||||
@ -325,7 +325,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -344,7 +344,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
|
||||
acctyp4 f, fO;
|
||||
acctyp3 f;
|
||||
acctyp4 fO;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6], vO[6];
|
||||
@ -386,6 +387,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj,factor_coul;
|
||||
@ -470,7 +472,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
||||
e_coul += prefactor*(_erfc-factor_coul);
|
||||
}
|
||||
if (EVFLAG && vflag) {
|
||||
acctyp4 fd;
|
||||
acctyp3 fd;
|
||||
fd.x = delx*force_coul;
|
||||
fd.y = dely*force_coul;
|
||||
fd.z = delz*force_coul;
|
||||
@ -645,7 +647,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -674,7 +676,8 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
|
||||
if (EVFLAG && eflag)
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
acctyp4 f, fO;
|
||||
acctyp3 f;
|
||||
acctyp4 fO;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6], vO[6];
|
||||
@ -717,6 +720,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj,factor_coul;
|
||||
@ -801,7 +805,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
|
||||
e_coul += prefactor*(_erfc-factor_coul);
|
||||
}
|
||||
if (EVFLAG && vflag) {
|
||||
acctyp4 fd;
|
||||
acctyp3 fd;
|
||||
fd.x = delx*force_coul;
|
||||
fd.y = dely*force_coul;
|
||||
fd.z = delz*force_coul;
|
||||
|
||||
@ -31,7 +31,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -47,7 +47,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -66,6 +66,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -119,7 +120,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -139,7 +140,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
|
||||
mie3[tid]=mie3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -161,6 +162,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -33,7 +33,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -49,7 +49,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -68,6 +68,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -120,7 +121,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -141,7 +142,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
|
||||
mor2[tid]=mor2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -163,6 +164,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -489,6 +489,10 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
|
||||
#endif
|
||||
|
||||
#define SPECIAL_DATA_PRELOAD_SIZE 3
|
||||
#define UNROLL_FACTOR_LIST 4
|
||||
#define UNROLL_FACTOR_SPECIAL 2
|
||||
|
||||
__kernel void kernel_special(__global int *dev_nbor,
|
||||
__global int *host_nbor_list,
|
||||
const __global int *host_numj,
|
||||
@ -526,23 +530,68 @@ __kernel void kernel_special(__global int *dev_nbor,
|
||||
list_end=list+fast_mul(numj,stride);
|
||||
}
|
||||
|
||||
for ( ; list<list_end; list+=stride) {
|
||||
int nbor=*list;
|
||||
tagint jtag=tag[nbor];
|
||||
|
||||
int offset=ii;
|
||||
for (int i=0; i<n3; i++) {
|
||||
if (special[offset]==jtag) {
|
||||
int which = 1;
|
||||
if (i>=n1)
|
||||
which++;
|
||||
if (i>=n2)
|
||||
which++;
|
||||
nbor=nbor ^ (which << SBBITS);
|
||||
*list=nbor;
|
||||
#if SPECIAL_DATA_PRELOAD_SIZE > 0
|
||||
tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE];
|
||||
for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) {
|
||||
special_preload[j] = special[ii + i*nt];
|
||||
}
|
||||
offset+=nt;
|
||||
#endif
|
||||
|
||||
for ( ; list<list_end; list+=UNROLL_FACTOR_LIST * stride) {
|
||||
int nbor[UNROLL_FACTOR_LIST];
|
||||
tagint jtag[UNROLL_FACTOR_LIST];
|
||||
__global int* list_addr[UNROLL_FACTOR_LIST];
|
||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||
list_addr[l] = list + l*stride;
|
||||
nbor[l] = *list_addr[l];
|
||||
}
|
||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||
jtag[l] = tag[nbor[l]];
|
||||
}
|
||||
|
||||
for (int i=0, j=0; i<n3; i+=UNROLL_FACTOR_SPECIAL, j++) {
|
||||
tagint special_data[UNROLL_FACTOR_SPECIAL];
|
||||
int which[UNROLL_FACTOR_SPECIAL];
|
||||
|
||||
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
|
||||
which[c] = 1;
|
||||
if (i + c < n3)
|
||||
{
|
||||
#if SPECIAL_DATA_PRELOAD_SIZE > 0
|
||||
if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) {
|
||||
special_data[c] = special_preload[j];
|
||||
}
|
||||
else
|
||||
#endif
|
||||
special_data[c] = special[ii + (i+c)*nt];
|
||||
}
|
||||
}
|
||||
|
||||
for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
|
||||
if (i+k >= n1) {
|
||||
which[k]++;
|
||||
}
|
||||
}
|
||||
for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
|
||||
if (i+k >= n2) {
|
||||
which[k]++;
|
||||
}
|
||||
which[k] <<= SBBITS;
|
||||
}
|
||||
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
|
||||
if (i + c < n3) {
|
||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||
if (special_data[c] == jtag[l]) {
|
||||
nbor[l]=nbor[l] ^ which[c];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||
*list_addr[l] = nbor[l];
|
||||
}
|
||||
}
|
||||
} // if ii
|
||||
}
|
||||
|
||||
|
||||
@ -217,7 +217,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
|
||||
const grdtyp delxinv, const grdtyp delyinv,
|
||||
const grdtyp delzinv, const int order,
|
||||
const int order2, const grdtyp qqrd2e_scale,
|
||||
__global acctyp4 *restrict ans) {
|
||||
__global acctyp3 *restrict ans) {
|
||||
__local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
|
||||
__local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
|
||||
__local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
|
||||
@ -239,7 +239,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
|
||||
fetch(qs,ii,q_tex);
|
||||
qs*=qqrd2e_scale;
|
||||
|
||||
acctyp4 ek;
|
||||
acctyp3 ek;
|
||||
ek.x=(acctyp)0.0;
|
||||
ek.y=(acctyp)0.0;
|
||||
ek.z=(acctyp)0.0;
|
||||
|
||||
@ -57,6 +57,7 @@
|
||||
#define MAX_SHARED_TYPES 11
|
||||
#define MAX_BIO_SHARED_TYPES 128
|
||||
#define PPPM_MAX_SPLINE 8
|
||||
#define NBOR_PREFETCH 0
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// KERNEL MACROS
|
||||
|
||||
@ -23,7 +23,7 @@
|
||||
// THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
|
||||
// BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
|
||||
// BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
|
||||
// PPPM_MAX_SPLINE}
|
||||
// PPPM_MAX_SPLINE, NBOR_PREFETCH}
|
||||
//
|
||||
//*************************************************************************/
|
||||
|
||||
@ -39,15 +39,15 @@ const char * ocl_config_names[] =
|
||||
};
|
||||
const char * ocl_config_strings[] =
|
||||
{
|
||||
"GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8",
|
||||
"NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8",
|
||||
"AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8",
|
||||
"GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8,0",
|
||||
"NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
|
||||
"AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
|
||||
#ifdef _SINGLE_SINGLE
|
||||
"INTEL_GPU,500,8,16,1,1,4,8,1,64,64,64,64,64,8,128,8,128,8",
|
||||
"APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8",
|
||||
"INTEL_GPU,500,8,32,1,1,4,8,2,128,128,128,128,64,8,128,8,128,8,2",
|
||||
"APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8,0",
|
||||
#else
|
||||
"INTEL_GPU,500,8,16,1,1,2,8,1,64,64,64,64,64,8,128,8,128,8",
|
||||
"APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8",
|
||||
"INTEL_GPU,500,8,32,1,1,2,8,2,128,128,128,128,64,8,128,8,128,8,2",
|
||||
"APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8,0",
|
||||
#endif
|
||||
"INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8"
|
||||
"INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8,0"
|
||||
};
|
||||
|
||||
@ -57,6 +57,10 @@ struct _lgpu_float2 {
|
||||
float x; float y;
|
||||
};
|
||||
|
||||
struct _lgpu_float3 {
|
||||
float x; float y; float z;
|
||||
};
|
||||
|
||||
struct _lgpu_float4 {
|
||||
float x; float y; float z; float w;
|
||||
};
|
||||
@ -65,6 +69,10 @@ struct _lgpu_double2 {
|
||||
double x; double y;
|
||||
};
|
||||
|
||||
struct _lgpu_double3 {
|
||||
double x; double y; double z;
|
||||
};
|
||||
|
||||
struct _lgpu_double4 {
|
||||
double x; double y; double z; double w;
|
||||
};
|
||||
@ -75,6 +83,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) {
|
||||
return out;
|
||||
}
|
||||
|
||||
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float3 &v) {
|
||||
out << v.x << " " << v.y << " " << v.z;
|
||||
return out;
|
||||
}
|
||||
|
||||
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) {
|
||||
out << v.x << " " << v.y << " " << v.z;
|
||||
return out;
|
||||
@ -85,6 +98,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) {
|
||||
return out;
|
||||
}
|
||||
|
||||
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double3 &v) {
|
||||
out << v.x << " " << v.y << " " << v.z;
|
||||
return out;
|
||||
}
|
||||
|
||||
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
|
||||
out << v.x << " " << v.y << " " << v.z;
|
||||
return out;
|
||||
@ -97,8 +115,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
|
||||
#define PRECISION float
|
||||
#define ACC_PRECISION double
|
||||
#define numtyp2 _lgpu_float2
|
||||
#define numtyp3 _lgpu_float3
|
||||
#define numtyp4 _lgpu_float4
|
||||
#define acctyp2 _lgpu_double2
|
||||
#define acctyp3 _lgpu_double3
|
||||
#define acctyp4 _lgpu_double4
|
||||
#endif
|
||||
|
||||
@ -107,8 +127,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
|
||||
#define PRECISION double
|
||||
#define ACC_PRECISION double
|
||||
#define numtyp2 _lgpu_double2
|
||||
#define numtyp3 _lgpu_double3
|
||||
#define numtyp4 _lgpu_double4
|
||||
#define acctyp2 _lgpu_double2
|
||||
#define acctyp3 _lgpu_double3
|
||||
#define acctyp4 _lgpu_double4
|
||||
#endif
|
||||
|
||||
@ -117,8 +139,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
|
||||
#define PRECISION float
|
||||
#define ACC_PRECISION float
|
||||
#define numtyp2 _lgpu_float2
|
||||
#define numtyp3 _lgpu_float3
|
||||
#define numtyp4 _lgpu_float4
|
||||
#define acctyp2 _lgpu_float2
|
||||
#define acctyp3 _lgpu_float3
|
||||
#define acctyp4 _lgpu_float4
|
||||
#endif
|
||||
|
||||
|
||||
@ -93,6 +93,13 @@
|
||||
// Definition: Maximum order for splines in PPPM
|
||||
// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
|
||||
//
|
||||
// NBOR_PREFETCH
|
||||
// Definition: Control use of prefetch for neighbor indices
|
||||
// 0 = No prefetch
|
||||
// 1 = Prefetch using standard API
|
||||
// 2 = Prefetch using Intel intrinsics
|
||||
// Restrictions: NBOR_PREFETCH forced to 0 when LAL_DISABLE_PREFETCH
|
||||
// is defined in library build
|
||||
//*************************************************************************/
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@ -101,6 +108,7 @@
|
||||
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_pre_cuda_hip.h"
|
||||
#define ucl_prefetch(p)
|
||||
#define ucl_pow pow
|
||||
#endif
|
||||
|
||||
@ -169,7 +177,7 @@
|
||||
#define ucl_abs fabs
|
||||
#define ucl_erfc erfc
|
||||
|
||||
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
|
||||
#if (FAST_MATH > 0) && !defined(_DOUBLE_DOUBLE)
|
||||
|
||||
#define ucl_exp native_exp
|
||||
#define ucl_pow pow
|
||||
@ -285,6 +293,55 @@
|
||||
#define simd_size() SIMD_SIZE
|
||||
#endif
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// OPENCL KERNEL MACROS - PREFETCH
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
#if (NBOR_PREFETCH == 0)
|
||||
#define ucl_prefetch(p)
|
||||
#endif
|
||||
|
||||
#if (NBOR_PREFETCH == 1)
|
||||
inline void ucl_prefetch(const __global int *p) {
|
||||
prefetch(p, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (NBOR_PREFETCH == 2)
|
||||
// Load message caching control
|
||||
enum LSC_LDCC {
|
||||
LSC_LDCC_DEFAULT,
|
||||
LSC_LDCC_L1UC_L3UC, //1 Override to L1 uncached and L3 uncached
|
||||
LSC_LDCC_L1UC_L3C, //1 Override to L1 uncached and L3 cached
|
||||
LSC_LDCC_L1C_L3UC, //1 Override to L1 cached and L3 uncached
|
||||
LSC_LDCC_L1C_L3C, //1 Override to L1 cached and L3 cached
|
||||
LSC_LDCC_L1S_L3UC, //1 Override to L1 streaming load and L3 uncached
|
||||
LSC_LDCC_L1S_L3C, //1 Override to L1 streaming load and L3 cached
|
||||
LSC_LDCC_L1IAR_L3C, //1 Override to L1 invalidate-after-read, and L3 cached
|
||||
};
|
||||
|
||||
void __builtin_IB_lsc_prefetch_global_uint(const __global uint *base,
|
||||
int elemOff,
|
||||
enum LSC_LDCC cacheOpt); //D32V1
|
||||
|
||||
inline void ucl_prefetch(const __global int *p) {
|
||||
__builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0,
|
||||
LSC_LDCC_L1C_L3UC);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct _lgpu_float3 {
|
||||
float x; float y; float z;
|
||||
};
|
||||
struct _lgpu_double3 {
|
||||
double x; double y; double z;
|
||||
};
|
||||
#ifdef _SINGLE_SINGLE
|
||||
#define acctyp3 struct _lgpu_float3
|
||||
#else
|
||||
#define acctyp3 struct _lgpu_double3
|
||||
#endif
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// END OPENCL DEFINITIONS
|
||||
// -------------------------------------------------------------------------
|
||||
@ -301,6 +358,9 @@
|
||||
#define numtyp4 double4
|
||||
#define acctyp double
|
||||
#define acctyp2 double2
|
||||
#ifndef acctyp3
|
||||
#define acctyp3 double3
|
||||
#endif
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
@ -310,6 +370,9 @@
|
||||
#define numtyp4 float4
|
||||
#define acctyp double
|
||||
#define acctyp2 double2
|
||||
#ifndef acctyp3
|
||||
#define acctyp3 double3
|
||||
#endif
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
@ -319,6 +382,9 @@
|
||||
#define numtyp4 float4
|
||||
#define acctyp float
|
||||
#define acctyp2 float2
|
||||
#ifndef acctyp3
|
||||
#define acctyp3 float3
|
||||
#endif
|
||||
#define acctyp4 float4
|
||||
#endif
|
||||
|
||||
|
||||
@ -32,6 +32,9 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
|
||||
return ans;
|
||||
}
|
||||
|
||||
#ifdef INTEL_OCL
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
#endif
|
||||
__kernel void k_resquared(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict q,
|
||||
const __global numtyp4 *restrict shape,
|
||||
@ -41,7 +44,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
|
||||
const int ntypes,
|
||||
const __global int *dev_nbor,
|
||||
const int stride,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
const int astride,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
@ -62,7 +65,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
|
||||
const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0;
|
||||
const numtyp cr60=ucl_cbrt((numtyp)60.0);
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
@ -122,6 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_nbor+nbor+n_stride);
|
||||
int j=dev_nbor[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
@ -105,28 +105,32 @@ void re_gpu_clear() {
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double **host_quat);
|
||||
const int *ellipsoid, const EllipsoidBonus *bonus);
|
||||
|
||||
int** re_gpu_compute_n(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial, tagint **special,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start, int **ilist,
|
||||
int **jnum, const double cpu_time, bool &success,
|
||||
double **host_quat) {
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, const int *ellipsoid,
|
||||
const void *bonus) {
|
||||
return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
|
||||
tag, nspecial, special, eflag, vflag, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time, success, host_quat);
|
||||
host_start, ilist, jnum, cpu_time, success, ellipsoid,
|
||||
static_cast<const EllipsoidBonus *>(bonus));
|
||||
}
|
||||
|
||||
int * re_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, double **host_quat) {
|
||||
const double cpu_time, bool &success,
|
||||
const int *ellipsoid, const void *bonus) {
|
||||
return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
|
||||
numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
|
||||
cpu_time, success, host_quat);
|
||||
cpu_time, success, ellipsoid,
|
||||
static_cast<const EllipsoidBonus *>(bonus));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -135,4 +139,3 @@ int * re_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double re_gpu_bytes() {
|
||||
return REMF.host_memory_usage();
|
||||
}
|
||||
|
||||
|
||||
@ -86,7 +86,7 @@
|
||||
ap1+=astride; \
|
||||
} \
|
||||
} \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -131,7 +131,7 @@
|
||||
ap1+=astride; \
|
||||
} \
|
||||
} \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -154,7 +154,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
|
||||
const int ntypes,
|
||||
const __global int *dev_nbor,
|
||||
const int stride,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
const int astride,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
@ -180,7 +180,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
|
||||
const numtyp solv_f_r =
|
||||
(numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
@ -216,6 +216,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_nbor+nbor+n_stride);
|
||||
int j=dev_nbor[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
@ -409,7 +410,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
const int ntypes,
|
||||
const __global int *dev_nbor,
|
||||
const int stride,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag,
|
||||
@ -435,7 +436,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
const numtyp solv_f_r =
|
||||
(numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -454,6 +455,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_nbor+nbor+n_stride);
|
||||
int i=dev_nbor[nbor];
|
||||
factor_lj = sp_lj[sbmask(i)];
|
||||
i &= NEIGHMASK;
|
||||
@ -610,7 +612,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict gum,
|
||||
const int stride,
|
||||
const __global int *dev_ij,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag, const int start,
|
||||
@ -628,7 +630,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=gum[2];
|
||||
sp_lj[3]=gum[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -647,6 +649,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_ij+nbor+n_stride);
|
||||
|
||||
int j=dev_ij[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -697,7 +700,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict gum,
|
||||
const int stride,
|
||||
const __global int *dev_ij,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag,
|
||||
@ -721,7 +724,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -743,6 +746,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_ij+nbor+n_stride);
|
||||
|
||||
int j=dev_ij[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -32,7 +32,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -48,7 +48,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -67,6 +67,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -119,7 +120,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -137,7 +138,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
|
||||
coeff[tid]=coeff_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -159,6 +160,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -57,7 +57,7 @@ _texture( sw3_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -116,7 +116,7 @@ _texture( sw3_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -194,7 +194,7 @@ _texture( sw3_tex,int4);
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -265,7 +265,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 * restrict c_14,
|
||||
const __global numtyp2 * restrict c_56,
|
||||
const int ntypes, const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -282,7 +282,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
if (EVFLAG && eflag) pre_sw_c56=c_56[ONETYPE];
|
||||
#endif
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -461,7 +461,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp2 *restrict sw_pre3,
|
||||
const int ntypes,
|
||||
const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -480,7 +480,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
|
||||
#endif
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -579,7 +579,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp2 *restrict sw_pre3,
|
||||
const int ntypes, const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -598,7 +598,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
|
||||
#endif
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -701,7 +701,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp2 *restrict sw_pre3,
|
||||
const int ntypes, const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -720,7 +720,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
|
||||
#endif
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
|
||||
@ -49,7 +49,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -66,7 +66,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -87,6 +87,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -146,7 +147,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -165,7 +166,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
|
||||
cutsq[tid]=cutsq_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -189,6 +190,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -251,7 +253,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -268,7 +270,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -289,6 +291,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -352,7 +355,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -371,7 +374,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
|
||||
cutsq[tid]=cutsq_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -395,6 +398,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -461,7 +465,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -478,7 +482,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -499,6 +503,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -569,7 +574,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
|
||||
const __global numtyp* sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *ans,
|
||||
__global acctyp3 *ans,
|
||||
__global acctyp *engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -588,7 +593,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
|
||||
cutsq[tid]=cutsq_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -611,6 +616,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -686,7 +692,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
|
||||
const __global numtyp* sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *ans,
|
||||
__global acctyp3 *ans,
|
||||
__global acctyp *engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -703,7 +709,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -724,6 +730,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -792,7 +799,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
|
||||
const __global numtyp* sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *ans,
|
||||
__global acctyp3 *ans,
|
||||
__global acctyp *engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -811,7 +818,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
|
||||
cutsq[tid]=cutsq_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -835,6 +842,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -132,7 +132,7 @@ _texture_2d( pos_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -211,7 +211,7 @@ _texture_2d( pos_tex,int4);
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -448,7 +448,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -472,7 +472,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
||||
const numtyp ijparam_bigd = ts2_in[ONETYPE3].w;
|
||||
#endif
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -553,7 +553,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp2 *restrict zetaij,
|
||||
const __global acctyp *restrict zetaij_e,
|
||||
const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -585,7 +585,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
||||
const numtyp gamma = ts4_in[ONETYPE3].w;
|
||||
#endif
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -728,7 +728,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp *restrict zetaij_e,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -760,7 +760,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
const numtyp gamma = ts4_in[ONETYPE3].w;
|
||||
#endif
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -950,7 +950,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp *restrict zetaij_e,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -982,7 +982,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const numtyp gamma = ts4_in[ONETYPE3].w;
|
||||
#endif
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
|
||||
@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -131,7 +131,7 @@ _texture_2d( pos_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -209,7 +209,7 @@ _texture_2d( pos_tex,int4);
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -417,7 +417,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -434,7 +434,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
||||
ts2[tid]=ts2_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -511,7 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
const int nelements, const int nparams,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -535,7 +535,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
ts5[tid]=ts5_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -676,7 +676,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -700,7 +700,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
ts5[tid]=ts5_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -890,7 +890,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -914,7 +914,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
ts5[tid]=ts5_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
|
||||
@ -81,7 +81,7 @@ _texture( ts6_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -149,7 +149,7 @@ _texture( ts6_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -227,7 +227,7 @@ _texture( ts6_tex,int4);
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -443,7 +443,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -462,7 +462,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
||||
ts6[tid]=ts6_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -544,7 +544,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
const int nelements, const int nparams,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -566,7 +566,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
ts4[tid]=ts4_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -703,7 +703,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -725,7 +725,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
ts4[tid]=ts4_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -908,7 +908,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -930,7 +930,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
ts4[tid]=ts4_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
|
||||
@ -33,7 +33,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -43,7 +43,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -61,6 +61,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -109,7 +110,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -130,7 +131,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
|
||||
uf3[tid]=uf3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -151,6 +152,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -73,7 +73,7 @@ _texture( param5_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -132,7 +132,7 @@ _texture( param5_tex,int4);
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -210,7 +210,7 @@ _texture( param5_tex,int4);
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) { \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -247,6 +247,7 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
||||
ucl_prefetch(dev_packed+nbor+nbor_pitch);
|
||||
int sj=dev_packed[nbor];
|
||||
int j = sj & NEIGHMASK;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -283,7 +284,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements,
|
||||
const __global int * dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int ev_stride) {
|
||||
@ -291,7 +292,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -313,6 +314,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
itype=map[itype];
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
||||
ucl_prefetch(dev_packed+nbor+nbor_pitch);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
j &= NEIGHMASK;
|
||||
@ -489,7 +491,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -504,7 +506,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
||||
|
||||
local_allocate_store_three();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -612,7 +614,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -627,7 +629,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
local_allocate_store_three();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -743,7 +745,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_ilist,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -758,7 +760,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
local_allocate_store_three();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
|
||||
@ -30,7 +30,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -46,7 +46,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -65,6 +65,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -118,7 +119,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -136,7 +137,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
|
||||
coeff[tid]=coeff_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -158,6 +159,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -40,7 +40,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -57,7 +57,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -77,6 +77,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -131,7 +132,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
@ -150,7 +151,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
coeff[tid]=coeff_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -173,6 +174,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
@ -88,7 +88,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
|
||||
const int lj_types,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -98,7 +98,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -116,6 +116,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
j &= NEIGHMASK;
|
||||
@ -179,7 +180,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
|
||||
const numtyp cut_inner,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
@ -198,7 +199,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
|
||||
coeff3[tid]=coeff3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -219,6 +220,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
@ -290,6 +290,20 @@ void FixGPU::init()
|
||||
|
||||
void FixGPU::setup(int vflag)
|
||||
{
|
||||
// See if we should overlap topology list builds on CPU with work on GPU
|
||||
int overlap_topo = 0;
|
||||
if ((atom->molecular != Atom::ATOMIC)) {
|
||||
PairHybrid *ph = reinterpret_cast<PairHybrid *>(force->pair_match("^hybrid",0));
|
||||
if (ph) {
|
||||
for (int isub=0; isub < ph->nstyles; ++isub) {
|
||||
if (force->pair_match("gpu",0,isub)) overlap_topo = 1;
|
||||
}
|
||||
} else {
|
||||
if (force->pair_match("gpu",0)) overlap_topo = 1;
|
||||
}
|
||||
}
|
||||
if (overlap_topo) neighbor->set_overlap_topo(1);
|
||||
|
||||
if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
|
||||
if (neighbor->exclude_setting() != 0)
|
||||
error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds");
|
||||
|
||||
@ -243,13 +243,8 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
|
||||
// update angular momentum by 1/2 step
|
||||
if (igroup == 0) {
|
||||
#if (LAL_USE_OMP_SIMD == 1)
|
||||
// Workaround for compiler bug
|
||||
#ifdef __INTEL_COMPILER
|
||||
#pragma simd
|
||||
#else
|
||||
#pragma omp simd
|
||||
#endif
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
double *quat = bonus[ellipsoid[i]].quat;
|
||||
ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
|
||||
@ -257,13 +252,8 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
|
||||
}
|
||||
} else {
|
||||
#if (LAL_USE_OMP_SIMD == 1)
|
||||
// Workaround for compiler bug
|
||||
#ifdef __INTEL_COMPILER
|
||||
#pragma simd
|
||||
#else
|
||||
#pragma omp simd
|
||||
#endif
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
if (mask[i] & groupbit) {
|
||||
double *quat = bonus[ellipsoid[i]].quat;
|
||||
|
||||
@ -155,6 +155,15 @@ PairAmoebaGPU::~PairAmoebaGPU()
|
||||
amoeba_gpu_clear();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoebaGPU::compute(int eflag, int vflag)
|
||||
{
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
PairAmoeba::compute(eflag, vflag);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
init specific to this pair style
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
@ -28,6 +28,7 @@ class PairAmoebaGPU : public PairAmoeba {
|
||||
public:
|
||||
PairAmoebaGPU(LAMMPS *lmp);
|
||||
~PairAmoebaGPU() override;
|
||||
void compute(int, int) override;
|
||||
void init_style() override;
|
||||
double memory_usage() override;
|
||||
|
||||
|
||||
@ -109,6 +109,8 @@ void PairBeckGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -129,6 +129,8 @@ void PairBornCoulLongCSGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -123,6 +123,8 @@ void PairBornCoulLongGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -117,6 +117,8 @@ void PairBornCoulWolfCSGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -114,6 +114,8 @@ void PairBornCoulWolfGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -109,6 +109,8 @@ void PairBornGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -111,6 +111,8 @@ void PairBuckCoulCutGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -120,6 +120,8 @@ void PairBuckCoulLongGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -107,6 +107,8 @@ void PairBuckGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -109,6 +109,8 @@ void PairColloidGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -108,6 +108,8 @@ void PairCoulCutGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -109,6 +109,8 @@ void PairCoulDebyeGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -118,6 +118,8 @@ void PairCoulDSFGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
@ -123,6 +123,8 @@ void PairCoulLongCSGPU::compute(int eflag, int vflag)
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user