Merge branch 'lammps:master' into smooth-gpu
This commit is contained in:
@ -24,10 +24,6 @@
|
||||
#ifndef NVD_DEVICE
|
||||
#define NVD_DEVICE
|
||||
|
||||
// workaround after GPU package Feb2021 update
|
||||
// todo: make new neighbor code work with CUDA
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
@ -108,17 +108,14 @@ class UCL_Program {
|
||||
std::cerr << log << std::endl
|
||||
<< "----------------------------------------------------------\n\n";
|
||||
#endif
|
||||
if (foutput != NULL) {
|
||||
fprintf(foutput,"\n\n");
|
||||
fprintf(foutput,
|
||||
"----------------------------------------------------------\n");
|
||||
fprintf(foutput," UCL Error: Error compiling PTX Program...\n");
|
||||
fprintf(foutput,
|
||||
"----------------------------------------------------------\n");
|
||||
fprintf(foutput,"%s\n",log);
|
||||
fprintf(foutput,
|
||||
"----------------------------------------------------------\n");
|
||||
fprintf(foutput,"\n\n");
|
||||
if (foutput != nullptr) {
|
||||
fprintf(foutput,"\n\n");
|
||||
fprintf(foutput, "----------------------------------------------------------\n");
|
||||
fprintf(foutput, " UCL Error: Error compiling PTX Program...\n");
|
||||
fprintf(foutput, "----------------------------------------------------------\n");
|
||||
fprintf(foutput, "%s\n",log->c_str());
|
||||
fprintf(foutput, "----------------------------------------------------------\n");
|
||||
fprintf(foutput,"\n\n");
|
||||
}
|
||||
return UCL_COMPILE_ERROR;
|
||||
}
|
||||
|
||||
@ -29,7 +29,7 @@
|
||||
#include <iostream>
|
||||
|
||||
#ifndef CL_TARGET_OPENCL_VERSION
|
||||
#define CL_TARGET_OPENCL_VERSION 210
|
||||
#define CL_TARGET_OPENCL_VERSION 300
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
@ -728,6 +728,9 @@ void UCL_Device::print_all(std::ostream &out) {
|
||||
out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
|
||||
out << " Type of device: "
|
||||
<< device_type_name(i).c_str() << std::endl;
|
||||
out << " Supported OpenCL Version: "
|
||||
<< _properties[i].cl_device_version / 100 << "."
|
||||
<< _properties[i].cl_device_version % 100 << std::endl;
|
||||
out << " Is a subdevice: ";
|
||||
if (is_subdevice(i))
|
||||
out << "Yes\n";
|
||||
@ -796,6 +799,16 @@ void UCL_Device::print_all(std::ostream &out) {
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
out << " Subgroup support: ";
|
||||
if (_properties[i].has_subgroup_support)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
out << " Shuffle support: ";
|
||||
if (_properties[i].has_shuffle_support)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
#include <cassert>
|
||||
|
||||
#ifndef CL_TARGET_OPENCL_VERSION
|
||||
#define CL_TARGET_OPENCL_VERSION 210
|
||||
#define CL_TARGET_OPENCL_VERSION 300
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
||||
@ -335,7 +335,7 @@ void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.cl_device_version() >= 210) {
|
||||
if (dev.has_subgroup_support()) {
|
||||
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
|
||||
|
||||
@ -348,7 +348,7 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.cl_device_version() >= 210) {
|
||||
if (dev.has_subgroup_support()) {
|
||||
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
|
||||
|
||||
@ -356,7 +356,7 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.cl_device_version() >= 210) {
|
||||
if (dev.has_subgroup_support()) {
|
||||
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
|
||||
|
||||
@ -356,7 +356,7 @@ void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.cl_device_version() >= 210) {
|
||||
if (dev.has_subgroup_support()) {
|
||||
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
|
||||
|
||||
@ -554,7 +554,7 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.cl_device_version() >= 210) {
|
||||
if (dev.has_subgroup_support()) {
|
||||
size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size);
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size));
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size));
|
||||
|
||||
@ -461,7 +461,7 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.cl_device_version() >= 210) {
|
||||
if (dev.has_subgroup_support()) {
|
||||
size_t mx_subgroup_sz = k_pair.max_subgroup_size(_block_size);
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center.max_subgroup_size(_block_size));
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end.max_subgroup_size(_block_size));
|
||||
|
||||
@ -150,7 +150,7 @@ int CHARMMT::loop(const int eflag, const int vflag) {
|
||||
&_cut_coul_innersq, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &ljd, &sp_lj,
|
||||
this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, this->_nbor_data,
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
|
||||
@ -29,7 +29,8 @@ _texture(q_tex, int2);
|
||||
#endif
|
||||
|
||||
__kernel void k_charmm(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp2 *restrict ljd,
|
||||
const __global numtyp4 *restrict lj1,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
@ -88,20 +89,14 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (rsq<cut_bothsq) {
|
||||
numtyp r2inv=ucl_recip(rsq);
|
||||
numtyp forcecoul, force_lj, force, switch1;
|
||||
numtyp lj3, lj4;
|
||||
numtyp forcecoul, force_lj, force, r6inv, switch1;
|
||||
|
||||
if (rsq < cut_ljsq) {
|
||||
numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
|
||||
numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);
|
||||
|
||||
numtyp sig_r_6 = sig6*sig6*r2inv;
|
||||
sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
|
||||
lj4 = (numtyp)4.0*eps*sig_r_6;
|
||||
lj3 = lj4*sig_r_6;
|
||||
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
if (rsq > cut_lj_innersq) {
|
||||
switch1 = (cut_ljsq-rsq);
|
||||
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)*
|
||||
@ -109,7 +104,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
|
||||
switch1 *= switch1;
|
||||
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)*
|
||||
denom_lj;
|
||||
switch2 *= lj3-lj4;
|
||||
switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
|
||||
force_lj = force_lj*switch1+switch2;
|
||||
}
|
||||
} else
|
||||
@ -137,7 +132,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
|
||||
if (EVFLAG && eflag) {
|
||||
e_coul += forcecoul;
|
||||
if (rsq < cut_ljsq) {
|
||||
numtyp e=lj3-lj4;
|
||||
numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
|
||||
if (rsq > cut_lj_innersq)
|
||||
e *= switch1;
|
||||
energy+=factor_lj*e;
|
||||
|
||||
@ -777,6 +777,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
|
||||
#ifdef USE_OPENCL
|
||||
// Workaround for timing issue on Intel OpenCL
|
||||
if (times[3] > 80e6) times[3]=0.0;
|
||||
if (times[5] > 80e6) times[5]=0.0;
|
||||
#endif
|
||||
|
||||
if (replica_me()==0)
|
||||
@ -1061,7 +1062,7 @@ bool lmp_gpu_config(const std::string &category, const std::string &setting)
|
||||
return setting == "opencl";
|
||||
#elif defined(USE_HIP)
|
||||
return setting == "hip";
|
||||
#elif defined(USE_CUDA)
|
||||
#elif defined(USE_CUDA) || defined(USE_CUDART)
|
||||
return setting == "cuda";
|
||||
#endif
|
||||
return false;
|
||||
|
||||
@ -225,7 +225,7 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
|
||||
const numtyp rdr, const numtyp rdrho,
|
||||
const numtyp rhomax, const int nrho,
|
||||
const int nr, const int t_per_atom) {
|
||||
int tid, ii, offset, i, itype;
|
||||
int tid, ii, offset, i, itype, tfrho;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
int n_stride;
|
||||
@ -242,6 +242,7 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
itype=ix.w;
|
||||
tfrho=type2frho[itype];
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=dev_packed[nbor];
|
||||
@ -270,7 +271,6 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
} // for nbor
|
||||
} // if ii
|
||||
const numtyp tfrho=type2frho[itype];
|
||||
store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
|
||||
eflag,vflag,engv,rdrho,nrho,i,rhomax,tfrho);
|
||||
}
|
||||
@ -291,7 +291,7 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
|
||||
const numtyp rdrho, const numtyp rhomax,
|
||||
const int nrho, const int nr,
|
||||
const int t_per_atom) {
|
||||
int tid, ii, offset, i, itype;
|
||||
int tid, ii, offset, i, itype, tfrho;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
#ifndef ONETYPE
|
||||
@ -305,9 +305,9 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
__syncthreads();
|
||||
#else
|
||||
const numtyp type2rhor_z2rx=
|
||||
const int type2rhor_z2rx=
|
||||
type2rhor_z2r_in[ONETYPE*MAX_SHARED_TYPES+ONETYPE].x;
|
||||
const numtyp tfrho=type2frho_in[ONETYPE];
|
||||
tfrho=type2frho_in[ONETYPE];
|
||||
#endif
|
||||
|
||||
int n_stride;
|
||||
@ -325,6 +325,7 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
#ifndef ONETYPE
|
||||
itype=ix.w;
|
||||
tfrho=type2frho[itype];
|
||||
#endif
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
@ -347,7 +348,7 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
|
||||
p = MIN(p,(numtyp)1.0);
|
||||
|
||||
#ifndef ONETYPE
|
||||
int jtype=fast_mul((int)MAX_SHARED_TYPES,jx.w);
|
||||
int jtype = fast_mul((int)MAX_SHARED_TYPES,jx.w);
|
||||
int mtype = jtype+itype;
|
||||
int index = type2rhor_z2r[mtype].x*(nr+1)+m;
|
||||
#else
|
||||
@ -358,9 +359,6 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
} // for nbor
|
||||
} // if ii
|
||||
#ifndef ONETYPE
|
||||
const numtyp tfrho=type2frho[itype];
|
||||
#endif
|
||||
store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
|
||||
eflag,vflag,engv,rdrho,nrho,i,rhomax,tfrho);
|
||||
}
|
||||
@ -498,8 +496,8 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
|
||||
__syncthreads();
|
||||
#else
|
||||
const int oi=ONETYPE*MAX_SHARED_TYPES+ONETYPE;
|
||||
const numtyp type2rhor_z2rx=type2rhor_z2r_in[oi].x;
|
||||
const numtyp type2rhor_z2ry=type2rhor_z2r_in[oi].y;
|
||||
const int type2rhor_z2rx=type2rhor_z2r_in[oi].x;
|
||||
const int type2rhor_z2ry=type2rhor_z2r_in[oi].y;
|
||||
#endif
|
||||
|
||||
int n_stride;
|
||||
|
||||
@ -26,10 +26,10 @@ _texture_2d( pos_tex,int4);
|
||||
|
||||
// LJ quantities scaled by epsilon and rmin = sigma*2^1/6 (see src/pair_lj_cubic.h)
|
||||
|
||||
#define _RT6TWO (numtyp)1.1224621
|
||||
#define _PHIS (numtyp)-0.7869823 /* energy at s */
|
||||
#define _DPHIDS (numtyp)2.6899009 /* gradient at s */
|
||||
#define _A3 (numtyp)27.93357 /* cubic coefficient */
|
||||
#define _RT6TWO (numtyp)1.1224620483093730 /* 2^1/6 */
|
||||
#define _PHIS (numtyp)-0.7869822485207097 /* energy at s */
|
||||
#define _DPHIDS (numtyp)2.6899008972047196 /* gradient at s */
|
||||
#define _A3 (numtyp)27.9335700460986445 /* cubic coefficient */
|
||||
|
||||
__kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1,
|
||||
|
||||
@ -26,8 +26,8 @@
|
||||
|
||||
#if !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
#ifndef LAL_USE_OLD_NEIGHBOR
|
||||
// Issue with incorrect results with CUDA 11.2
|
||||
#if (CUDA_VERSION > 11019) && (CUDA_VERSION < 11030)
|
||||
// Issue with incorrect results with CUDA >= 11.2
|
||||
#if (CUDA_VERSION > 11019)
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -34,8 +34,8 @@ _texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 2)
|
||||
// Issue with incorrect results in CUDA 11.2
|
||||
#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 2)
|
||||
// Issue with incorrect results in CUDA >= 11.2
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
#endif
|
||||
#endif
|
||||
@ -115,7 +115,7 @@ __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
|
||||
#define tagint int
|
||||
#endif
|
||||
#ifdef LAMMPS_BIGBIG
|
||||
#define tagint long long int
|
||||
#define tagint long
|
||||
#endif
|
||||
#ifdef LAMMPS_SMALLSMALL
|
||||
#define tagint int
|
||||
|
||||
@ -108,10 +108,7 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
|
||||
_nparams = nparams;
|
||||
_nelements = nelements;
|
||||
|
||||
UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
|
||||
UCL_READ_WRITE);
|
||||
host_write.zero();
|
||||
cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_cutsq_max=0.0;
|
||||
for (int ii=1; ii<ntypes; ii++) {
|
||||
const int i=host_map[ii];
|
||||
for (int jj=1; jj<ntypes; jj++) {
|
||||
@ -120,12 +117,10 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
|
||||
const int k=host_map[kk];
|
||||
if (i<0 || j<0 || k<0) continue;
|
||||
const int ijkparam = host_elem2param[i][j][k];
|
||||
if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
|
||||
host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
|
||||
if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
|
||||
}
|
||||
}
|
||||
}
|
||||
ucl_copy(cutsq_pair,host_write,ntypes*ntypes);
|
||||
|
||||
// --------------------------------------------------------------------
|
||||
UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
|
||||
@ -235,7 +230,6 @@ void TersoffT::clear() {
|
||||
ts3.clear();
|
||||
ts4.clear();
|
||||
ts5.clear();
|
||||
cutsq_pair.clear();
|
||||
map.clear();
|
||||
elem2param.clear();
|
||||
_zetaij.clear();
|
||||
@ -286,7 +280,7 @@ int TersoffT::loop(const int eflag, const int vflag, const int evatom,
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
|
||||
this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
|
||||
&this->nbor->dev_nbor, &this->nbor->dev_packed,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
|
||||
@ -226,17 +226,13 @@ _texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq_pair,
|
||||
const int ntypes, __global int * dev_nbor,
|
||||
const numtyp cutsq, const int ntypes,
|
||||
__global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom_in) {
|
||||
const int ii=GLOBAL_ID_X;
|
||||
|
||||
#ifdef ONETYPE
|
||||
const numtyp cutsq=cutsq_pair[ONETYPE];
|
||||
#endif
|
||||
|
||||
if (ii<inum) {
|
||||
const int i=dev_packed[ii];
|
||||
int nbor=ii+nbor_pitch;
|
||||
@ -245,9 +241,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
#ifndef ONETYPE
|
||||
const int itype=ix.w*ntypes;
|
||||
#endif
|
||||
int newj=0;
|
||||
|
||||
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
|
||||
@ -258,11 +251,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
int j = sj & NEIGHMASK;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
#ifndef ONETYPE
|
||||
const int mtype=jx.w+itype;
|
||||
const numtyp cutsq=cutsq_pair[mtype];
|
||||
#endif
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
|
||||
@ -73,7 +73,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
|
||||
/// ts5.x = beta, ts5.y = powern, ts5.z = lam2, ts5.w = bigb
|
||||
UCL_D_Vec<numtyp4> ts5;
|
||||
|
||||
UCL_D_Vec<numtyp> cutsq_pair;
|
||||
numtyp _cutsq_max;
|
||||
|
||||
UCL_D_Vec<int> elem2param;
|
||||
UCL_D_Vec<int> map;
|
||||
|
||||
@ -142,7 +142,10 @@ ucl_inline numtyp ters_fa_d(const numtyp r,
|
||||
numtyp *ans_d)
|
||||
{
|
||||
#ifndef ONETYPE
|
||||
if (r > param_bigr + param_bigd) return (numtyp)0.0;
|
||||
if (r > param_bigr + param_bigd) {
|
||||
*ans_d = (numtyp)0.0;
|
||||
return (numtyp)0.0;
|
||||
}
|
||||
#endif
|
||||
numtyp dfc;
|
||||
const numtyp fc=ters_fc_d(r,param_bigr,param_bigd,&dfc);
|
||||
|
||||
@ -105,10 +105,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
|
||||
_nparams = nparams;
|
||||
_nelements = nelements;
|
||||
|
||||
UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
|
||||
UCL_READ_WRITE);
|
||||
host_write.zero();
|
||||
cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_cutsq_max=0.0;
|
||||
for (int ii=1; ii<ntypes; ii++) {
|
||||
const int i=host_map[ii];
|
||||
for (int jj=1; jj<ntypes; jj++) {
|
||||
@ -117,12 +114,10 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
|
||||
const int k=host_map[kk];
|
||||
if (i<0 || j<0 || k<0) continue;
|
||||
const int ijkparam = host_elem2param[i][j][k];
|
||||
if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
|
||||
host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
|
||||
if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
|
||||
}
|
||||
}
|
||||
}
|
||||
ucl_copy(cutsq_pair,host_write,ntypes*ntypes);
|
||||
|
||||
UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
@ -229,7 +224,6 @@ void TersoffMT::clear() {
|
||||
ts3.clear();
|
||||
ts4.clear();
|
||||
ts5.clear();
|
||||
cutsq_pair.clear();
|
||||
map.clear();
|
||||
elem2param.clear();
|
||||
_zetaij.clear();
|
||||
@ -275,7 +269,7 @@ int TersoffMT::loop(const int eflag, const int vflag, const int evatom,
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
|
||||
this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
|
||||
&this->nbor->dev_nbor, &this->nbor->dev_packed,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
|
||||
@ -220,17 +220,13 @@ _texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq_pair,
|
||||
const int ntypes, __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const numtyp cutsq, const int ntypes,
|
||||
__global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int ii=GLOBAL_ID_X;
|
||||
|
||||
#ifdef ONETYPE
|
||||
const numtyp cutsq=cutsq_pair[ONETYPE];
|
||||
#endif
|
||||
|
||||
if (ii<inum) {
|
||||
const int i=dev_packed[ii];
|
||||
int nbor=ii+nbor_pitch;
|
||||
@ -239,9 +235,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
#ifndef ONETYPE
|
||||
const int itype=ix.w*ntypes;
|
||||
#endif
|
||||
int newj=0;
|
||||
|
||||
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
|
||||
@ -252,11 +245,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
int j = sj & NEIGHMASK;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
#ifndef ONETYPE
|
||||
const int mtype=jx.w+itype;
|
||||
const numtyp cutsq=cutsq_pair[mtype];
|
||||
#endif
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
|
||||
@ -76,7 +76,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
|
||||
/// ts5.x = c5, ts5.y = h
|
||||
UCL_D_Vec<numtyp4> ts5;
|
||||
|
||||
UCL_D_Vec<numtyp> cutsq_pair;
|
||||
numtyp _cutsq_max;
|
||||
|
||||
UCL_D_Vec<int> elem2param;
|
||||
UCL_D_Vec<int> map;
|
||||
|
||||
@ -112,10 +112,7 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
|
||||
_nparams = nparams;
|
||||
_nelements = nelements;
|
||||
|
||||
UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
|
||||
UCL_READ_WRITE);
|
||||
host_write.zero();
|
||||
cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_cutsq_max=0.0;
|
||||
for (int ii=1; ii<ntypes; ii++) {
|
||||
const int i=host_map[ii];
|
||||
for (int jj=1; jj<ntypes; jj++) {
|
||||
@ -124,12 +121,10 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
|
||||
const int k=host_map[kk];
|
||||
if (i<0 || j<0 || k<0) continue;
|
||||
const int ijkparam = host_elem2param[i][j][k];
|
||||
if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
|
||||
host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
|
||||
if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
|
||||
}
|
||||
}
|
||||
}
|
||||
ucl_copy(cutsq_pair,host_write,ntypes*ntypes);
|
||||
|
||||
UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
@ -253,7 +248,6 @@ void TersoffZT::clear() {
|
||||
ts4.clear();
|
||||
ts5.clear();
|
||||
ts6.clear();
|
||||
cutsq_pair.clear();
|
||||
map.clear();
|
||||
elem2param.clear();
|
||||
_zetaij.clear();
|
||||
@ -299,7 +293,7 @@ int TersoffZT::loop(const int eflag, const int vflag, const int evatom,
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
|
||||
this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
|
||||
&this->nbor->dev_nbor, &this->nbor->dev_packed,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
|
||||
@ -238,17 +238,13 @@ _texture( ts6_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq_pair,
|
||||
const int ntypes, __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const numtyp cutsq, const int ntypes,
|
||||
__global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int ii=GLOBAL_ID_X;
|
||||
|
||||
#ifdef ONETYPE
|
||||
const numtyp cutsq=cutsq_pair[ONETYPE];
|
||||
#endif
|
||||
|
||||
if (ii<inum) {
|
||||
const int i=dev_packed[ii];
|
||||
int nbor=ii+nbor_pitch;
|
||||
@ -257,9 +253,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
#ifndef ONETYPE
|
||||
const int itype=ix.w*ntypes;
|
||||
#endif
|
||||
int newj=0;
|
||||
|
||||
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
|
||||
@ -270,11 +263,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
int j = sj & NEIGHMASK;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
#ifndef ONETYPE
|
||||
const int mtype=jx.w+itype;
|
||||
const numtyp cutsq=cutsq_pair[mtype];
|
||||
#endif
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
|
||||
@ -80,7 +80,7 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
|
||||
/// ts6.x = Z_i, ts6.y = Z_j, ts6.z = ZBLcut, ts6.w = ZBLexpscale
|
||||
UCL_D_Vec<numtyp4> ts6;
|
||||
|
||||
UCL_D_Vec<numtyp> cutsq_pair;
|
||||
numtyp _cutsq_max;
|
||||
|
||||
UCL_D_Vec<int> elem2param;
|
||||
UCL_D_Vec<int> map;
|
||||
|
||||
Reference in New Issue
Block a user