diff --git a/lib/gpu/lal_answer.h b/lib/gpu/lal_answer.h index 149e8e9705..790d9c1f8d 100644 --- a/lib/gpu/lal_answer.h +++ b/lib/gpu/lal_answer.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -49,14 +49,18 @@ class Answer { inline void inum(const int n) { _inum=n; } /// Return the maximum number of atoms that can be stored currently inline int max_inum() const { return _max_local; } - + /// Return the number of fields used for energy and virial + inline int ev_fields(const int mode) const { + return (mode == 1) ? _ev_fields : _e_fields; + } + /// Memory usage per atom in this class - int bytes_per_atom() const; + int bytes_per_atom() const; /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions **/ bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev); - + /// Check if we have enough device storage and realloc if not inline void resize(const int inum, bool &success) { _inum=inum; @@ -67,14 +71,14 @@ class Answer { _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); } } - + /// If already initialized by another LAMMPS style, add fields as necessary /** \param rot True if atom storage needs quaternions **/ bool add_fields(const bool charge, const bool rot); - + /// Free all memory on host and device void clear(); - + /// Return the total amount of host memory used by class in bytes double host_memory_usage() const; @@ -92,12 +96,12 @@ class Answer { inline double transfer_time() { return time_answer.total_seconds(); } - + /// Return the total time for data cast/pack inline double cast_time() { return _time_cast; } - + /// Return number of bytes used on device - inline double gpu_bytes() { return _gpu_bytes; } + inline double gpu_bytes() { return _gpu_bytes; } // -------------------------COPY FROM GPU ------------------------------- @@ -108,7 +112,7 @@ class Answer { /// Copy answers from device into read buffer asynchronously void copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom, int *ilist); - + /// Copy energy and virial data into LAMMPS memory double energy_virial(double *eatom, double **vatom, double *virial); @@ -119,7 +123,7 @@ class Answer { /// Add forces and torques from the GPU into a LAMMPS pointer void get_answers(double **f, double **tor); - inline double get_answers(double **f, double **tor, double *eatom, + inline double get_answers(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { double ta=MPI_Wtime(); time_answer.sync_stop(); @@ -130,7 +134,7 @@ class Answer { _time_cast+=MPI_Wtime()-ts; return evdw; } - + /// Return the time the CPU was idle waiting for GPU inline double cpu_idle_time() { return _time_cpu_idle; } @@ -143,23 +147,23 @@ class Answer { UCL_Vector force; /// Energy and virial per-atom storage UCL_Vector engv; - + /// Device timers UCL_Timer time_answer; - + /// Geryon device UCL_Device *dev; private: bool alloc(const int inum); - + bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; int _max_local, _inum, _e_fields, _ev_fields, _ans_fields; int *_ilist; double _time_cast, _time_cpu_idle; - + double _gpu_bytes; - + bool _newton; }; diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index f818f62e26..c41aad7b58 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -194,7 +194,7 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, resize_atom(inum,nall,success); resize_local(nall,host_inum,nbor->max_nbors(),success); if (!success) - return 1; + return 0; atom->cast_copy_x(host_x,host_type); int mn; diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index 15ff8aff13..0af290469a 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -13,8 +13,8 @@ email : brownw@ornl.gov ***************************************************************************/ -#ifndef LAL_BASE_ATOMIC_H -#define LAL_BASE_ATOMIC_H +#ifndef LAL_BASE_THREE_H +#define LAL_BASE_THREE_H #include "lal_device.h" #include "lal_balance.h" @@ -28,7 +28,7 @@ #include "geryon/nvd_texture.h" #endif -#define THREE_CONCURRENT +//#define THREE_CONCURRENT namespace LAMMPS_AL { diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index fa56eac819..4492e5f60a 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -37,7 +37,7 @@ texture sw3_tex; #define THIRD (numtyp)0.66666667 -#define THREE_CONCURRENT +//#define THREE_CONCURRENT #if (ARCH < 300) diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index ebf3b2fcb1..fc7ebc4f08 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -33,10 +33,10 @@ TersoffT::Tersoff() : BaseThree(), _allocated(false) { } template -TersoffT::~Tersoff() { +TersoffT::~Tersoff() { clear(); } - + template int TersoffT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,11 +45,11 @@ int TersoffT::bytes_per_atom(const int max_nbors) const { template int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *_screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, + int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* lam1, const double* lam2, const double* lam3,const double* powermint, - const double* biga, const double* bigb, const double* bigr, const double* bigd, - const double* c1, const double* c2, const double* c3, const double* c4, - const double* c, const double* d, const double* h, const double* gamma, + const double* biga, const double* bigb, const double* bigr, const double* bigd, + const double* c1, const double* c2, const double* c3, const double* c4, + const double* c, const double* d, const double* h, const double* gamma, const double* beta, const double* powern, const double* host_cutsq) { int success; @@ -62,11 +62,7 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - - _max_zij_size=static_cast(static_cast(ef_nall)*1.10); - _zetaij.alloc(_max_zij_size*max_nbors,*(this->ucl_device),UCL_READ_WRITE); - zeta_tex.get_texture(*(this->pair_program),"zeta_tex"); - zeta_tex.bind_float(_zetaij,1); + _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); k_zeta.set_function(*(this->pair_program),"k_tersoff_zeta"); @@ -87,74 +83,74 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int UCL_WRITE_ONLY); for (int i=0; iucl_device),UCL_READ_ONLY); - + for (int i=0; i(lam1[i]); dview[i].y=static_cast(lam2[i]); dview[i].z=static_cast(lam3[i]); dview[i].w=static_cast(powermint[i]); } - + ucl_copy(ts1,dview,false); ts1_tex.get_texture(*(this->pair_program),"ts1_tex"); ts1_tex.bind_float(ts1,4); ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - + for (int i=0; i(biga[i]); dview[i].y=static_cast(bigb[i]); dview[i].z=static_cast(bigr[i]); dview[i].w=static_cast(bigd[i]); } - + ucl_copy(ts2,dview,false); ts2_tex.get_texture(*(this->pair_program),"ts2_tex"); ts2_tex.bind_float(ts2,4); ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - + for (int i=0; i(c1[i]); dview[i].y=static_cast(c2[i]); dview[i].z=static_cast(c3[i]); dview[i].w=static_cast(c4[i]); } - + ucl_copy(ts3,dview,false); ts3_tex.get_texture(*(this->pair_program),"ts3_tex"); ts3_tex.bind_float(ts3,4); ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - + for (int i=0; i(c[i]); dview[i].y=static_cast(d[i]); dview[i].z=static_cast(h[i]); dview[i].w=static_cast(gamma[i]); } - + ucl_copy(ts4,dview,false); ts4_tex.get_texture(*(this->pair_program),"ts4_tex"); ts4_tex.bind_float(ts4,4); ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - + for (int i=0; i(beta[i]); dview[i].y=static_cast(powern[i]); dview[i].z=(numtyp)0; dview[i].w=(numtyp)0; } - + ucl_copy(ts5,dview,false); ts5_tex.get_texture(*(this->pair_program),"ts5_tex"); ts5_tex.bind_float(ts5,4); @@ -227,11 +223,11 @@ double TersoffT::host_memory_usage() const { // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void TersoffT::compute(const int f_ago, const int nlocal, const int nall, +void TersoffT::compute(const int f_ago, const int nlocal, const int nall, const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, + const bool vatom, int &host_start, const double cpu_time, bool &success) { this->acc_timers(); if (nlist==0) { @@ -254,6 +250,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall, this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; + _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); } this->atom->cast_x_data(host_x,host_type); @@ -261,24 +258,28 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall, this->atom->add_x_data(host_x,host_type); // re-allocate zetaij if necessary - if (nall>_max_zij_size) { - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - _max_zij_size=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_max_zij_size); - zeta_tex.bind_float(_zetaij,1); + if (nall*_max_nbors > _zetaij.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + _zetaij.resize(_max_nbors*_nmax); } + int _eflag; + if (eflag) + _eflag=1; + else + _eflag=0; + int ainum=nall; int nbor_pitch=this->nbor->nbor_pitch(); int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/(JTHREADS*KTHREADS)))); this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -303,7 +304,7 @@ template int ** TersoffT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -317,7 +318,7 @@ int ** TersoffT::compute(const int ago, const int inum_full, this->zero_timers(); return NULL; } - + this->hd_balancer.balance(cpu_time); int inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); @@ -325,7 +326,7 @@ int ** TersoffT::compute(const int ago, const int inum_full, this->ans2->inum(inum); #endif host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -342,23 +343,28 @@ int ** TersoffT::compute(const int ago, const int inum_full, *jnum=this->nbor->host_acc.begin(); // re-allocate zetaij if necessary - if (nall>_max_zij_size) { - _max_zij_size=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_max_zij_size); - zeta_tex.bind_float(_zetaij,1); + if (nall*_max_nbors > _zetaij.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + _zetaij.resize(_max_nbors*_nmax); } + int _eflag; + if (eflag) + _eflag=1; + else + _eflag=0; + int ainum=nall; int nbor_pitch=this->nbor->nbor_pitch(); int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/(JTHREADS*KTHREADS)))); this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, - &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -374,7 +380,7 @@ int ** TersoffT::compute(const int ago, const int inum_full, this->device->add_ans_object(this->ans2); #endif this->hd_balancer.stop_timer(); - + return this->nbor->host_jlist.begin()-host_start; } @@ -403,21 +409,21 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->time_pair.start(); this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, + this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/(KTHREADS*JTHREADS)))); + (BX/(KTHREADS*JTHREADS)))); this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -428,20 +434,19 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, + this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - } this->time_pair.stop(); diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index e6c6d5db75..0175f3fcf7 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -13,8 +13,6 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#define THREE_CONCURRENT - #ifdef NV_KERNEL #include "lal_tersoff_extra.h" @@ -25,7 +23,6 @@ texture ts2_tex; texture ts3_tex; texture ts4_tex; texture ts5_tex; -texture zeta_tex; #else texture pos_tex; texture ts1_tex; @@ -33,7 +30,6 @@ texture ts2_tex; texture ts3_tex; texture ts4_tex; texture ts5_tex; -texture zeta_tex; #endif #else @@ -43,17 +39,27 @@ texture zeta_tex; #define ts3_tex ts3 #define ts4_tex ts4 #define ts5_tex ts5 -#define zeta_tex zetaij #endif +//#define THREE_CONCURRENT + #define THIRD (numtyp)0.66666667 +#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ + i, nbor_j, offset_j, idx) \ + if (nbor_mem==packed_mem) { \ + int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ + idx = jj*n_stride + i*t_per_atom + offset_j; \ + } else { \ + idx = nbor_j; \ + } + #if (ARCH < 300) -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_ELLIPSE]; \ + __local acctyp red_acc[6][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ @@ -100,16 +106,27 @@ texture zeta_tex; ans[ii]=old; \ } +#define store_zeta(z, tid, t_per_atom, offset) \ + if (t_per_atom>1) { \ + red_acc[tid]=z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + red_acc[tid] += red_acc[tid+s]; \ + } \ + } \ + z=red_acc[tid]; \ + } + #else -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ } \ if (vflag>0) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ @@ -137,43 +154,65 @@ texture zeta_tex; ans[ii]=old; \ } +#define store_zeta(z, tid, t_per_atom, offset) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_xor(z, s, t_per_atom); \ + } \ + } + #endif // Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries // while the block size should never be less than 32. // SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block // must be increased if there will be more than 3 elements in the future. + #define SHARED_SIZE 32 __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, + const __global numtyp4 *restrict ts3_in, const __global numtyp4 *restrict ts4_in, + const __global numtyp4 *restrict ts5_in, const __global numtyp *restrict cutsq, - const __global int *restrict map, - const __global int *restrict elem2param, + const __global int *restrict map_in, + const __global int *restrict elem2param_in, const int nelements, const int nparams, - __global numtyp * zetaij, - const __global int * dev_nbor, - const __global int * dev_packed, - const int nall, const int inum, + __global numtyp4 * zetaij, + const __global int * dev_nbor, + const __global int * dev_packed, + const int eflag, const int nall, const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq,n_stride; tpa_sq = fast_mul(t_per_atom,t_per_atom); int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - + // must be increased if there will be more than 3 elements in the future. __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; + __local numtyp4 ts3[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; + __local numtyp4 ts5[SHARED_SIZE]; + __local int elem2param[SHARED_SIZE]; + __local int map[SHARED_SIZE]; if (tid cutsq[ijparam]) continue; - + // compute zeta_ij - numtyp z = (numtyp)0; + z = (numtyp)0; + int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { + for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { int k=dev_packed[nbor_k]; k &= NEIGHMASK; - + if (k == j) continue; - + numtyp4 kx; fetch4(kx,k,pos_tex); //x_[k]; int ktype=kx.w; ktype=map[ktype]; - + // Compute rik delr2.x = kx.x-ix.x; delr2.y = kx.y-ix.y; delr2.z = kx.z-ix.z; numtyp rsq2 = delr2.x*delr2.x+delr2.y*delr2.y+delr2.z*delr2.z; - - int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype]; + + int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype]; if (rsq2 > cutsq[ijkparam]) continue; - + numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); numtyp ijkparam_lam3 = ts1_ijkparam.z; numtyp ijkparam_powermint = ts1_ijkparam.w; @@ -241,17 +281,48 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, numtyp ijkparam_c = ts4_ijkparam.x; numtyp ijkparam_d = ts4_ijkparam.y; numtyp ijkparam_h = ts4_ijkparam.z; - numtyp ijkparam_gamma = ts4_ijkparam.w; + numtyp ijkparam_gamma = ts4_ijkparam.w; z += zeta(ijkparam_powermint, ijkparam_lam3, ijkparam_bigr, ijkparam_bigd, ijkparam_c, ijkparam_d, ijkparam_h, ijkparam_gamma, rsq1, rsq2, delr1, delr2); } - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; - int idx = jj*n_stride + i*t_per_atom + offset_j; - zetaij[idx] = z; - } // for nbor + //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; + //int idx = jj*n_stride + i*t_per_atom + offset_j; + int idx; + zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, + i, nbor_j, offset_j, idx); + store_zeta(z, tid, t_per_atom, offset_k); + numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); + numtyp ijparam_lam2 = ts1_ijparam.y; + numtyp4 ts2_ijparam = ts2[ijparam]; //fetch4(ts2_ijparam,ijparam,ts2_tex); + numtyp ijparam_bigb = ts2_ijparam.y; + numtyp ijparam_bigr = ts2_ijparam.z; + numtyp ijparam_bigd = ts2_ijparam.w; + numtyp4 ts3_ijparam = ts3[ijparam]; //fetch4(ts3_ijparam,ijparam,ts3_tex); + numtyp ijparam_c1 = ts3_ijparam.x; + numtyp ijparam_c2 = ts3_ijparam.y; + numtyp ijparam_c3 = ts3_ijparam.z; + numtyp ijparam_c4 = ts3_ijparam.w; + numtyp4 ts5_ijparam = ts5[ijparam]; //fetch4(ts5_ijparam,ijparam,ts5_tex); + numtyp ijparam_beta = ts5_ijparam.x; + numtyp ijparam_powern = ts5_ijparam.y; + + if (offset_k == 0) { + numtyp fpfeng[4]; + force_zeta(ijparam_bigb, ijparam_bigr, ijparam_bigd, ijparam_lam2, + ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, ijparam_c3, + ijparam_c4, rsq1, z, eflag, fpfeng); + numtyp4 zij; + zij.x = fpfeng[0]; + zij.y = fpfeng[1]; + zij.z = fpfeng[2]; + zij.w = z; + zetaij[idx] = zij; + } + + } // for nbor } // if ii } @@ -259,15 +330,15 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, const __global numtyp *restrict cutsq, - const __global int *restrict map, - const __global int *restrict elem2param, + const __global int *restrict map_in, + const __global int *restrict elem2param_in, const int nelements, const int nparams, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom) { __local int n_stride; int tid, ii, offset; @@ -275,9 +346,13 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; + __local int elem2param[SHARED_SIZE]; + __local int map[SHARED_SIZE]; if (tid0) - energy+=feng[1]; + if (eflag>0) + energy+=feng[1]; if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; @@ -352,41 +427,40 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, } -__kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, +__kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp4 *restrict ts3_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp4 *restrict ts5_in, const __global numtyp *restrict cutsq, - const __global int *restrict map, - const __global int *restrict elem2param, + const __global int *restrict map_in, + const __global int *restrict elem2param_in, const int nelements, const int nparams, - const __global numtyp *restrict zetaij, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp4 *restrict zetaij, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); + numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; - __local numtyp4 ts3[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; - __local numtyp4 ts5[SHARED_SIZE]; + __local int elem2param[SHARED_SIZE]; + __local int map[SHARED_SIZE]; if (tid cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); - numtyp r1inv = ucl_recip(r1); - + numtyp r1inv = ucl_rsqrt(rsq1); + // look up for zeta_ij - int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; - int idx = jj*n_stride + i*t_per_atom + offset_j; - numtyp zeta_ij = (numtyp)0; - fetch(zeta_ij,idx,zeta_tex); // zeta_ij = zetaij[idx]; - - numtyp fpfeng[4]; - { - numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); - numtyp ijparam_lam2 = ts1_ijparam.y; - numtyp4 ts2_ijparam = ts2[ijparam]; //fetch4(ts2_ijparam,ijparam,ts2_tex); - numtyp ijparam_bigb = ts2_ijparam.y; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; - numtyp4 ts3_ijparam = ts3[ijparam]; //fetch4(ts3_ijparam,ijparam,ts3_tex); - numtyp ijparam_c1 = ts3_ijparam.x; - numtyp ijparam_c2 = ts3_ijparam.y; - numtyp ijparam_c3 = ts3_ijparam.z; - numtyp ijparam_c4 = ts3_ijparam.w; - numtyp4 ts5_ijparam = ts5[ijparam]; //fetch4(ts5_ijparam,ijparam,ts5_tex); - numtyp ijparam_beta = ts5_ijparam.x; - numtyp ijparam_powern = ts5_ijparam.y; - force_zeta(ijparam_bigb, ijparam_bigr, ijparam_bigd, ijparam_lam2, - ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, ijparam_c3, - ijparam_c4, rsq1, zeta_ij, eflag, fpfeng); - } - numtyp force = fpfeng[0]; + //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; + //int idx = jj*n_stride + i*t_per_atom + offset_j; + int idx; + zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, + i, nbor_j, offset_j, idx); + numtyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + numtyp force = zeta_ij.x*tpainv; + numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - numtyp prefactor = fpfeng[1]; if (eflag>0) { - energy+=fpfeng[2]; + energy+=zeta_ij.z*tpainv; } if (vflag>0) { numtyp mforce = -force; @@ -498,32 +554,26 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype]; if (rsq2 > cutsq[ijkparam]) continue; numtyp r2 = ucl_sqrt(rsq2); - numtyp r2inv = ucl_recip(r2); + numtyp r2inv = ucl_rsqrt(rsq2); numtyp fi[3], fj[3], fk[3]; - { numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); - numtyp ijkparam_lam3 = ts1_ijkparam.z; - numtyp ijkparam_powermint = ts1_ijkparam.w; + lam3 = ts1_ijkparam.z; + powermint = ts1_ijkparam.w; numtyp4 ts2_ijkparam = ts2[ijkparam]; //fetch4(ts2_ijkparam,ijkparam,ts2_tex); - numtyp ijkparam_bigr = ts2_ijkparam.z; - numtyp ijkparam_bigd = ts2_ijkparam.w; + bigr = ts2_ijkparam.z; + bigd = ts2_ijkparam.w; numtyp4 ts4_ijkparam = ts4[ijkparam]; //fetch4(ts4_ijkparam,ijkparam,ts4_tex); - numtyp ijkparam_c = ts4_ijkparam.x; - numtyp ijkparam_d = ts4_ijkparam.y; - numtyp ijkparam_h = ts4_ijkparam.z; - numtyp ijkparam_gamma = ts4_ijkparam.w; - if (vflag>0) - attractive(ijkparam_bigr, ijkparam_bigd, ijkparam_powermint, - ijkparam_lam3, ijkparam_c, ijkparam_d, ijkparam_h, - ijkparam_gamma, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, - fi, fj, fk); - else - attractive_fi(ijkparam_bigr, ijkparam_bigd, ijkparam_powermint, - ijkparam_lam3, ijkparam_c, ijkparam_d, ijkparam_h, - ijkparam_gamma, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, - fi); - } + c = ts4_ijkparam.x; + d = ts4_ijkparam.y; + h = ts4_ijkparam.z; + gamma = ts4_ijkparam.w; + if (vflag>0) + attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, + prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); + else + attractive_fi(bigr, bigd, powermint, lam3, c, d, h, gamma, + prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi); f.x += fi[0]; f.y += fi[1]; f.z += fi[2]; @@ -545,48 +595,47 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, } // nbor_k } // for nbor_j - store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom, + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, offset,eflag,vflag,ans,engv); } // if ii } -__kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, +__kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp4 *restrict ts3_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp4 *restrict ts5_in, const __global numtyp *restrict cutsq, - const __global int *restrict map, - const __global int *restrict elem2param, + const __global int *restrict map_in, + const __global int *restrict elem2param_in, const int nelements, const int nparams, - const __global numtyp *restrict zetaij, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp4 *restrict zetaij, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); + numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; - __local numtyp4 ts3[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; - __local numtyp4 ts5[SHARED_SIZE]; + __local int elem2param[SHARED_SIZE]; + __local int map[SHARED_SIZE]; if (tid cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); - numtyp r1inv = ucl_recip(r1); + numtyp r1inv = ucl_rsqrt(rsq1); numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; @@ -636,7 +687,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, mdelr1[2] = -delr1[2]; int nbor_k=j+nbor_pitch; - int numk=dev_nbor[nbor_k]; + int numk=dev_nbor[nbor_k]; if (dev_nbor==dev_packed) { nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); @@ -650,50 +701,41 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list - int ijnum = 0; - for ( ; nbor_k= 0) { + offset_kf = offset_k; + } else { + ijnum = red_acc[2*m+0]; + offset_kf = red_acc[2*m+1]; } - numtyp force = fpfeng[0]; + + //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride; + //int idx = iix*n_stride + j*t_per_atom + offset_kf; + int idx; + zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, + j, ijnum, offset_kf, idx); + numtyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); + numtyp force = zeta_ji.x*tpainv; + numtyp prefactor_ji = zeta_ji.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - numtyp prefactor = fpfeng[1]; if (eflag>0) { - energy+=fpfeng[2]; + energy+=zeta_ji.z*tpainv; } if (vflag>0) { numtyp mforce = -force; @@ -706,7 +748,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, } // attractive forces - for (nbor_k = nbork_start ; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); - numtyp r2inv = ucl_recip(r2); + numtyp r2inv = ucl_rsqrt(rsq2); + numtyp4 ts1_param, ts2_param, ts4_param; + numtyp fi[3]; - numtyp fj[3], fk[3]; - { - numtyp4 ts1_jikparam = ts1[jikparam]; //fetch4(ts1_jikparam,jikparam,ts1_tex); - numtyp jikparam_lam3 = ts1_jikparam.z; - numtyp jikparam_powermint = ts1_jikparam.w; - numtyp4 ts2_jikparam = ts2[jikparam]; //fetch4(ts2_jikparam,jikparam,ts2_tex); - numtyp jikparam_bigr = ts2_jikparam.z; - numtyp jikparam_bigd = ts2_jikparam.w; - numtyp4 ts4_jikparam = ts4[jikparam]; //fetch4(ts4_jikparam,jikparam,ts4_tex); - numtyp jikparam_c = ts4_jikparam.x; - numtyp jikparam_d = ts4_jikparam.y; - numtyp jikparam_h = ts4_jikparam.z; - numtyp jikparam_gamma = ts4_jikparam.w; - attractive_fj(jikparam_bigr, jikparam_bigd, jikparam_powermint, - jikparam_lam3, jikparam_c, jikparam_d, jikparam_h, - jikparam_gamma, prefactor, r1, r1inv, r2, r2inv, mdelr1, delr2, - fj); - } - f.x += fj[0]; - f.y += fj[1]; - f.z += fj[2]; - - int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; // tpa = 1 - int idx = kk*n_stride + j*t_per_atom + offset_k; - numtyp zeta_jk = (numtyp)0; - fetch(zeta_jk,idx,zeta_tex); // zetaij[idx]; - - numtyp prefactor_jk; - int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype]; - { - numtyp4 ts1_jkparam = ts1[jkparam]; //fetch4(ts1_jkparam,jkparam,ts1_tex); - numtyp jkparam_lam2 = ts1_jkparam.y; - numtyp4 ts2_jkparam = ts2[jkparam]; //fetch4(ts2_jkparam,jkparam,ts2_tex); - numtyp jkparam_bigb = ts2_jkparam.y; - numtyp jkparam_bigr = ts2_jkparam.z; - numtyp jkparam_bigd = ts2_jkparam.w; - numtyp4 ts3_jkparam = ts3[jkparam]; //fetch4(ts3_jkparam,jkparam,ts3_tex); - numtyp jkparam_c1 = ts3_jkparam.x; - numtyp jkparam_c2 = ts3_jkparam.y; - numtyp jkparam_c3 = ts3_jkparam.z; - numtyp jkparam_c4 = ts3_jkparam.w; - numtyp4 ts5_jkparam = ts5[jkparam]; //fetch4(ts5_jkparam,jkparam,ts5_tex); - numtyp jkparam_beta = ts5_jkparam.x; - numtyp jkparam_powern = ts5_jkparam.y; - prefactor_jk = (numtyp)-0.5 * - ters_fa(r2, jkparam_bigb, jkparam_bigr, jkparam_bigd, jkparam_lam2) * - ters_bij_d(zeta_jk, jkparam_beta, jkparam_powern, - jkparam_c1, jkparam_c2, jkparam_c3, jkparam_c4); - } + ts1_param = ts1[jikparam]; //fetch4(ts1_jikparam,jikparam,ts1_tex); + lam3 = ts1_param.z; + powermint = ts1_param.w; + ts2_param = ts2[jikparam]; //fetch4(ts2_jikparam,jikparam,ts2_tex); + bigr = ts2_param.z; + bigd = ts2_param.w; + ts4_param = ts4[jikparam]; //fetch4(ts4_jikparam,jikparam,ts4_tex); + c = ts4_param.x; + d = ts4_param.y; + h = ts4_param.z; + gamma = ts4_param.w; + attractive_fj(bigr, bigd, powermint, lam3, c, d, h, gamma, + prefactor_ji, r1, r1inv, r2, r2inv, mdelr1, delr2, fi); + f.x += fi[0]; + f.y += fi[1]; + f.z += fi[2]; + //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; + //int idx = kk*n_stride + j*t_per_atom + offset_k; + int idx; + zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, + j, nbor_k, offset_k, idx); + numtyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; - { - numtyp4 ts1_jkiparam = ts1[jkiparam]; //fetch4(ts1_jkiparam,jkiparam,ts1_tex); - numtyp jkiparam_lam3 = ts1_jkiparam.z; - numtyp jkiparam_powermint = ts1_jkiparam.w; - numtyp4 ts2_jkiparam = ts2[jkiparam]; //fetch4(ts2_jkiparam,jkiparam,ts2_tex); - numtyp jkiparam_bigr = ts2_jkiparam.z; - numtyp jkiparam_bigd = ts2_jkiparam.w; - numtyp4 ts4_jkiparam = ts4[jkiparam]; //fetch4(ts4_jkiparam,jkiparam,ts4_tex); - numtyp jkiparam_c = ts4_jkiparam.x; - numtyp jkiparam_d = ts4_jkiparam.y; - numtyp jkiparam_h = ts4_jkiparam.z; - numtyp jkiparam_gamma = ts4_jkiparam.w; - attractive_fk(jkiparam_bigr, jkiparam_bigd, jkiparam_powermint, - jkiparam_lam3,jkiparam_c, jkiparam_d, jkiparam_h, - jkiparam_gamma, prefactor_jk, r2, r2inv, r1, r1inv, - delr2, mdelr1, fk); - } - f.x += fk[0]; - f.y += fk[1]; - f.z += fk[2]; - } - } // for nbor + ts1_param = ts1[jkiparam]; //fetch4(ts1_jkiparam,jkiparam,ts1_tex); + lam3 = ts1_param.z; + powermint = ts1_param.w; + ts2_param = ts2[jkiparam]; //fetch4(ts2_jkiparam,jkiparam,ts2_tex); + bigr = ts2_param.z; + bigd = ts2_param.w; + ts4_param = ts4[jkiparam]; //fetch4(ts4_jkiparam,jkiparam,ts4_tex); + c = ts4_param.x; + d = ts4_param.y; + h = ts4_param.z; + gamma = ts4_param.w; + attractive_fk(bigr, bigd, powermint, lam3, c, d, h, gamma, + prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, fi); + f.x += fi[0]; + f.y += fi[1]; + f.z += fi[2]; + } // for nbor_k + } // for nbor_j #ifdef THREE_CONCURRENT store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, @@ -809,45 +822,43 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, eflag,vflag,ans,engv); #endif - } // if ii } -__kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, +__kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp4 *restrict ts3_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp4 *restrict ts5_in, const __global numtyp *restrict cutsq, - const __global int *restrict map, - const __global int *restrict elem2param, + const __global int *restrict map_in, + const __global int *restrict elem2param_in, const int nelements, const int nparams, - const __global numtyp *restrict zetaij, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp4 *restrict zetaij, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); + numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; - __local numtyp4 ts3[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; - __local numtyp4 ts5[SHARED_SIZE]; + __local int elem2param[SHARED_SIZE]; + __local int map[SHARED_SIZE]; if (tid cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); - numtyp r1inv = ucl_recip(r1); + numtyp r1inv = ucl_rsqrt(rsq1); numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; @@ -899,7 +912,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, mdelr1[2] = -delr1[2]; int nbor_k=j+nbor_pitch; - int numk=dev_nbor[nbor_k]; + int numk=dev_nbor[nbor_k]; if (dev_nbor==dev_packed) { nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); @@ -913,50 +926,41 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, int nbork_start = nbor_k; // look up for zeta_ji - int ijnum = 0; + int m = tid / t_per_atom; + int ijnum = -1; for ( ; nbor_k= 0) { + offset_kf = offset_k; + } else { + ijnum = red_acc[2*m+0]; + offset_kf = red_acc[2*m+1]; } - numtyp force = fpfeng[0]; + + //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride; + //int idx = iix*n_stride + j*t_per_atom + offset_kf; + int idx; + zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, + j, ijnum, offset_kf, idx); + numtyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); + numtyp force = zeta_ji.x*tpainv; + numtyp prefactor = zeta_ji.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - numtyp prefactor = fpfeng[1]; if (eflag>0) { - energy+=fpfeng[2]; + energy+=zeta_ji.z*tpainv; } if (vflag>0) { numtyp mforce = -force; @@ -969,7 +973,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, } // attractive forces - for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); - numtyp r2inv = ucl_recip(r2); + numtyp r2inv = ucl_rsqrt(rsq2); numtyp fi[3], fj[3], fk[3]; - { - numtyp4 ts1_jikparam = ts1[jikparam]; //fetch4(ts1_jikparam,jikparam,ts1_tex); - numtyp jikparam_lam3 = ts1_jikparam.z; - numtyp jikparam_powermint = ts1_jikparam.w; - numtyp4 ts2_jikparam = ts2[jikparam]; //fetch4(ts2_jikparam,jikparam,ts2_tex); - numtyp jikparam_bigr = ts2_jikparam.z; - numtyp jikparam_bigd = ts2_jikparam.w; - numtyp4 ts4_jikparam = ts4[jikparam]; //fetch4(ts4_jikparam,jikparam,ts4_tex); - numtyp jikparam_c = ts4_jikparam.x; - numtyp jikparam_d = ts4_jikparam.y; - numtyp jikparam_h = ts4_jikparam.z; - numtyp jikparam_gamma = ts4_jikparam.w; - attractive(jikparam_bigr, jikparam_bigd, jikparam_powermint, - jikparam_lam3, jikparam_c, jikparam_d, jikparam_h, - jikparam_gamma, prefactor, r1, r1inv, r2, r2inv, mdelr1, delr2, - fi, fj, fk); - } + numtyp4 ts1_param, ts2_param, ts4_param; + ts1_param = ts1[jikparam]; //fetch4(ts1_jikparam,jikparam,ts1_tex); + lam3 = ts1_param.z; + powermint = ts1_param.w; + ts2_param = ts2[jikparam]; //fetch4(ts2_jikparam,jikparam,ts2_tex); + bigr = ts2_param.z; + bigd = ts2_param.w; + ts4_param = ts4[jikparam]; //fetch4(ts4_jikparam,jikparam,ts4_tex); + c = ts4_param.x; + d = ts4_param.y; + h = ts4_param.z; + gamma = ts4_param.w; + attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, + prefactor, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, fk); f.x += fj[0]; f.y += fj[1]; f.z += fj[2]; @@ -1020,52 +1020,28 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += THIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] += THIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; - int idx = kk*n_stride + j*t_per_atom + offset_k; - numtyp zeta_jk = (numtyp)0; - fetch(zeta_jk,idx,zeta_tex); // zetaij[idx]; - - numtyp prefactor_jk; - int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype]; - { - numtyp4 ts1_jkparam = ts1[jkparam]; //fetch4(ts1_jkparam,jkparam,ts1_tex); - numtyp jkparam_lam2 = ts1_jkparam.y; - numtyp4 ts2_jkparam = ts2[jkparam]; //fetch4(ts2_jkparam,jkparam,ts2_tex); - numtyp jkparam_bigb = ts2_jkparam.y; - numtyp jkparam_bigr = ts2_jkparam.z; - numtyp jkparam_bigd = ts2_jkparam.w; - numtyp4 ts3_jkparam = ts3[jkparam]; //fetch4(ts3_jkparam,jkparam,ts3_tex); - numtyp jkparam_c1 = ts3_jkparam.x; - numtyp jkparam_c2 = ts3_jkparam.y; - numtyp jkparam_c3 = ts3_jkparam.z; - numtyp jkparam_c4 = ts3_jkparam.w; - numtyp4 ts5_jkparam = ts5[jkparam]; //fetch4(ts5_jkparam,jkparam,ts5_tex); - numtyp jkparam_beta = ts5_jkparam.x; - numtyp jkparam_powern = ts5_jkparam.y; - prefactor_jk = (numtyp)-0.5 * - ters_fa(r2, jkparam_bigb, jkparam_bigr, jkparam_bigd, jkparam_lam2) * - ters_bij_d(zeta_jk, jkparam_beta, jkparam_powern, - jkparam_c1, jkparam_c2, jkparam_c3, jkparam_c4); - } + //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; + //int idx = kk*n_stride + j*t_per_atom + offset_k; + int idx; + zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, + j, nbor_k, offset_k, idx); + numtyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; - { - numtyp4 ts1_jkiparam = ts1[jkiparam]; //fetch4(ts1_jkiparam,jkiparam,ts1_tex); - numtyp jkiparam_lam3 = ts1_jkiparam.z; - numtyp jkiparam_powermint = ts1_jkiparam.w; - numtyp4 ts2_jkiparam = ts2[jkiparam]; //fetch4(ts2_jkiparam,jkiparam,ts2_tex); - numtyp jkiparam_bigr = ts2_jkiparam.z; - numtyp jkiparam_bigd = ts2_jkiparam.w; - numtyp4 ts4_jkiparam = ts4[jkiparam]; //fetch4(ts4_jkiparam,jkiparam,ts4_tex); - numtyp jkiparam_c = ts4_jkiparam.x; - numtyp jkiparam_d = ts4_jkiparam.y; - numtyp jkiparam_h = ts4_jkiparam.z; - numtyp jkiparam_gamma = ts4_jkiparam.w; - attractive(jkiparam_bigr, jkiparam_bigd, jkiparam_powermint, - jkiparam_lam3, jkiparam_c, jkiparam_d, jkiparam_h, - jkiparam_gamma, prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, - fi, fj, fk); - } + ts1_param = ts1[jkiparam]; //fetch4(ts1_jkiparam,jkiparam,ts1_tex); + lam3 = ts1_param.z; + powermint = ts1_param.w; + ts2_param = ts2[jkiparam]; //fetch4(ts2_jkiparam,jkiparam,ts2_tex); + bigr = ts2_param.z; + bigd = ts2_param.w; + ts4_param = ts4[jkiparam]; //fetch4(ts4_jkiparam,jkiparam,ts4_tex); + c = ts4_param.x; + d = ts4_param.y; + h = ts4_param.z; + gamma = ts4_param.w; + attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, + prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, fi, fj, fk); f.x += fk[0]; f.y += fk[1]; f.z += fk[2]; @@ -1088,3 +1064,4 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, #endif } // if ii } + diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h index 5e760297ba..beae6f5e08 100644 --- a/lib/gpu/lal_tersoff.h +++ b/lib/gpu/lal_tersoff.h @@ -30,25 +30,25 @@ class Tersoff : public BaseThree { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, + int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, + int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* lam1, const double* lam2, const double* lam3, const double* powermint, const double* biga, const double* bigb, const double* bigr, const double* bigd, const double* c1, const double* c2, - const double* c3, const double* c4, const double* c, const double* d, - const double* h, const double* gamma, const double* beta, + const double* c3, const double* c4, const double* c, const double* d, + const double* h, const double* gamma, const double* beta, const double* powern, const double* cutsq); /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, + void compute(const int f_ago, const int inum_full, const int nall, const int nlist, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -58,10 +58,10 @@ class Tersoff : public BaseThree { int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -77,7 +77,7 @@ class Tersoff : public BaseThree { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; /// ts1.x = lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint @@ -97,13 +97,15 @@ class Tersoff : public BaseThree { UCL_D_Vec map; int _nparams,_nelements; - /// Per-atom arrays - UCL_D_Vec _zetaij; + /// Per-atom arrays: + /// zetaij.x = force, zetaij.y = prefactor, zetaij.z = evdwl, + /// zetaij.w = zetaij + UCL_D_Vec _zetaij; UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, zeta_tex; + UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - int _max_zij_size, _max_nbors; + int _max_nbors; private: bool _allocated; diff --git a/lib/gpu/lal_tersoff_ext.cpp b/lib/gpu/lal_tersoff_ext.cpp index c33ea6615e..a608a57179 100644 --- a/lib/gpu/lal_tersoff_ext.cpp +++ b/lib/gpu/lal_tersoff_ext.cpp @@ -27,15 +27,15 @@ static Tersoff TSMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, +int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen, int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* ts_lam1, const double* ts_lam2, const double* ts_lam3, const double* ts_powermint, const double* ts_biga, const double* ts_bigb, - const double* ts_bigr, const double* ts_bigd, - const double* ts_c1, const double* ts_c2, const double* ts_c3, const double* ts_c4, - const double* ts_c, const double* ts_d, const double* ts_h, - const double* ts_gamma, const double* ts_beta, + const double* ts_bigr, const double* ts_bigd, + const double* ts_c1, const double* ts_c2, const double* ts_c3, const double* ts_c4, + const double* ts_c, const double* ts_d, const double* ts_h, + const double* ts_gamma, const double* ts_beta, const double* ts_powern, const double* ts_cutsq) { TSMF.clear(); gpu_mode=TSMF.device->gpu_mode(); @@ -47,13 +47,9 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int int procs_per_gpu=TSMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - // disable multiple threads per atom for now - if (TSMF.device->threads_per_atom() != 1) - return -10; - TSMF.device->init_message(screen,"tersoff/gpu",first_gpu,last_gpu); bool message=false; @@ -69,9 +65,9 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int if (world_me==0) init_ok=TSMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - ts_lam1, ts_lam2, ts_lam3, ts_powermint, - ts_biga, ts_bigb, ts_bigr, ts_bigd, - ts_c1, ts_c2, ts_c3, ts_c4, ts_c, ts_d, ts_h, + ts_lam1, ts_lam2, ts_lam3, ts_powermint, + ts_biga, ts_bigb, ts_bigr, ts_bigd, + ts_c1, ts_c2, ts_c3, ts_c4, ts_c, ts_d, ts_h, ts_gamma, ts_beta, ts_powern, ts_cutsq); TSMF.device->world_barrier(); @@ -90,13 +86,13 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int if (gpu_rank==i && world_me!=0) init_ok=TSMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - ts_lam1, ts_lam2, ts_lam3, ts_powermint, - ts_biga, ts_bigb, ts_bigr, ts_bigd, - ts_c1, ts_c2, ts_c3, ts_c4, ts_c, ts_d, ts_h, + ts_lam1, ts_lam2, ts_lam3, ts_powermint, + ts_biga, ts_bigb, ts_bigr, ts_bigd, + ts_c1, ts_c2, ts_c3, ts_c4, ts_c, ts_d, ts_h, ts_gamma, ts_beta, ts_powern, ts_cutsq); TSMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -121,12 +117,12 @@ int ** tersoff_gpu_compute_n(const int ago, const int inum_full, return TSMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} -void tersoff_gpu_compute(const int ago, const int nlocal, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, +void tersoff_gpu_compute(const int ago, const int nlocal, const int nall, + const int nlist, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { TSMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); diff --git a/lib/gpu/lal_tersoff_extra.h b/lib/gpu/lal_tersoff_extra.h index 7a8fc5872a..672a767783 100644 --- a/lib/gpu/lal_tersoff_extra.h +++ b/lib/gpu/lal_tersoff_extra.h @@ -1,7 +1,7 @@ /// ************************************************************************** // tersoff_extra.h // ------------------- -// Trung Dac Nguyen +// Trung Dac Nguyen // // Device code for Tersoff math routines // @@ -26,7 +26,7 @@ /* ---------------------------------------------------------------------- */ -ucl_inline numtyp vec3_dot(const numtyp x[3], const numtyp y[3]) +ucl_inline numtyp vec3_dot(const numtyp x[3], const numtyp y[3]) { return (x[0]*y[0] + x[1]*y[1] + x[2]*y[2]); } @@ -36,12 +36,12 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3]) z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2]; } -ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3]) +ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3]) { y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2]; } -ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3], +ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3], const numtyp y[3], numtyp z[3]) { z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2]; @@ -51,14 +51,14 @@ ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3], ucl_inline numtyp ters_gijk(const numtyp costheta, const numtyp param_c, - const numtyp param_d, + const numtyp param_d, const numtyp param_h, - const numtyp param_gamma) + const numtyp param_gamma) { const numtyp ters_c = param_c * param_c; const numtyp ters_d = param_d * param_d; const numtyp hcth = param_h - costheta; - return param_gamma*((numtyp)1.0 + ters_c*ucl_recip(ters_d) - + return param_gamma*((numtyp)1.0 + ters_c*ucl_recip(ters_d) - ters_c *ucl_recip(ters_d + hcth*hcth)); } @@ -66,9 +66,9 @@ ucl_inline numtyp ters_gijk(const numtyp costheta, ucl_inline numtyp ters_gijk_d(const numtyp costheta, const numtyp param_c, - const numtyp param_d, - const numtyp param_h, - const numtyp param_gamma) + const numtyp param_d, + const numtyp param_h, + const numtyp param_gamma) { const numtyp ters_c = param_c * param_c; const numtyp ters_d = param_d * param_d; @@ -80,12 +80,12 @@ ucl_inline numtyp ters_gijk_d(const numtyp costheta, /* ---------------------------------------------------------------------- */ -ucl_inline void costheta_d(const numtyp rij_hat[3], +ucl_inline void costheta_d(const numtyp rij_hat[3], const numtyp rij, - const numtyp rik_hat[3], + const numtyp rik_hat[3], const numtyp rik, - numtyp *dri, - numtyp *drj, + numtyp *dri, + numtyp *drj, numtyp *drk) { // first element is derivative wrt Ri, second wrt Rj, third wrt Rk @@ -131,85 +131,87 @@ ucl_inline numtyp ters_fa(const numtyp r, const numtyp param_lam2) { if (r > param_bigr + param_bigd) return (numtyp)0.0; - return -param_bigb * ucl_exp(-param_lam2 * r) * + return -param_bigb * ucl_exp(-param_lam2 * r) * ters_fc(r,param_bigr,param_bigd); } /* ---------------------------------------------------------------------- */ ucl_inline numtyp ters_fa_d(const numtyp r, - const numtyp param_bigb, + const numtyp param_bigb, const numtyp param_bigr, const numtyp param_bigd, const numtyp param_lam2) { if (r > param_bigr + param_bigd) return (numtyp)0.0; - return param_bigb * ucl_exp(-param_lam2 * r) * (param_lam2 * + return param_bigb * ucl_exp(-param_lam2 * r) * (param_lam2 * ters_fc(r,param_bigr,param_bigd) - ters_fc_d(r,param_bigr,param_bigd)); } /* ---------------------------------------------------------------------- */ ucl_inline numtyp ters_bij(const numtyp zeta, - const numtyp param_beta, + const numtyp param_beta, const numtyp param_powern, - const numtyp param_c1, - const numtyp param_c2, + const numtyp param_c1, + const numtyp param_c2, const numtyp param_c3, const numtyp param_c4) { numtyp tmp = param_beta * zeta; if (tmp > param_c1) return ucl_rsqrt(tmp); if (tmp > param_c2) - return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) / + return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) / ((numtyp)2.0*param_powern))*ucl_rsqrt(tmp); if (tmp < param_c4) return (numtyp)1.0; if (tmp < param_c3) return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); - return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern), + return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern), (numtyp)-1.0/((numtyp)2.0*param_powern)); } /* ---------------------------------------------------------------------- */ ucl_inline numtyp ters_bij_d(const numtyp zeta, - const numtyp param_beta, + const numtyp param_beta, const numtyp param_powern, - const numtyp param_c1, - const numtyp param_c2, + const numtyp param_c1, + const numtyp param_c2, const numtyp param_c3, const numtyp param_c4) { numtyp tmp = param_beta * zeta; - if (tmp > param_c1) + if (tmp > param_c1) return param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); if (tmp > param_c2) return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * - ((numtyp)1.0 - (numtyp)0.5 * ((numtyp)1.0 + (numtyp)1.0 / - ((numtyp)2.0 * param_powern)) * ucl_powr(tmp,-param_powern))); + // error in negligible 2nd term fixed 9/30/2015 + // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * + ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * + ucl_powr(tmp,-param_powern))); if (tmp < param_c4) return (numtyp)0.0; if (tmp < param_c3) return (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); numtyp tmp_n = ucl_powr(tmp,param_powern); - return (numtyp)-0.5 * ucl_powr((numtyp)1.0+tmp_n, (numtyp) - + return (numtyp)-0.5 * ucl_powr((numtyp)1.0+tmp_n, (numtyp) - (numtyp)1.0-((numtyp)1.0 / ((numtyp)2.0 * param_powern)))*tmp_n / zeta; } /* ---------------------------------------------------------------------- */ ucl_inline void ters_zetaterm_d(const numtyp prefactor, - const numtyp rij_hat[3], + const numtyp rij_hat[3], const numtyp rij, - const numtyp rik_hat[3], + const numtyp rik_hat[3], const numtyp rik, - const numtyp param_bigr, + const numtyp param_bigr, const numtyp param_bigd, const numtyp param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, - const numtyp param_h, + const numtyp param_h, const numtyp param_gamma, numtyp dri[3], numtyp drj[3], @@ -229,7 +231,7 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if ((int)param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; @@ -269,17 +271,17 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, } ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, - const numtyp rij_hat[3], + const numtyp rij_hat[3], const numtyp rij, - const numtyp rik_hat[3], + const numtyp rik_hat[3], const numtyp rik, - const numtyp param_bigr, + const numtyp param_bigr, const numtyp param_bigd, const numtyp param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, - const numtyp param_h, + const numtyp param_h, const numtyp param_gamma, numtyp dri[3]) { @@ -297,7 +299,7 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if ((int)param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; @@ -319,17 +321,17 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, } ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, - const numtyp rij_hat[3], + const numtyp rij_hat[3], const numtyp rij, - const numtyp rik_hat[3], + const numtyp rik_hat[3], const numtyp rik, - const numtyp param_bigr, + const numtyp param_bigr, const numtyp param_bigd, const numtyp param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, - const numtyp param_h, + const numtyp param_h, const numtyp param_gamma, numtyp drj[3]) { @@ -346,7 +348,7 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if ((int)param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; @@ -365,17 +367,17 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, } ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, - const numtyp rij_hat[3], + const numtyp rij_hat[3], const numtyp rij, - const numtyp rik_hat[3], + const numtyp rik_hat[3], const numtyp rik, - const numtyp param_bigr, + const numtyp param_bigr, const numtyp param_bigd, const numtyp param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, - const numtyp param_h, + const numtyp param_h, const numtyp param_gamma, numtyp drk[3]) { @@ -393,7 +395,7 @@ ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if ((int)param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; @@ -419,20 +421,20 @@ ucl_inline void repulsive(const numtyp param_bigr, const numtyp param_bigd, const numtyp param_lam1, const numtyp param_biga, - const numtyp rsq, - const int eflag, + const numtyp rsq, + const int eflag, numtyp *ans) -{ - numtyp r,tmp_fc,tmp_fc_d,tmp_exp; - r = ucl_sqrt(rsq); - tmp_fc = ters_fc(r,param_bigr,param_bigd); - tmp_fc_d = ters_fc_d(r,param_bigr,param_bigd); - tmp_exp = ucl_exp(-param_lam1 * r); +{ + numtyp r,tmp_fc,tmp_fc_d,tmp_exp; + r = ucl_sqrt(rsq); + tmp_fc = ters_fc(r,param_bigr,param_bigd); + tmp_fc_d = ters_fc_d(r,param_bigr,param_bigd); + tmp_exp = ucl_exp(-param_lam1 * r); // fforce - ans[0] = -param_biga*tmp_exp*(tmp_fc_d - tmp_fc*param_lam1)*ucl_recip(r); + ans[0] = -param_biga*tmp_exp*(tmp_fc_d - tmp_fc*param_lam1)*ucl_recip(r); // eng - if (eflag) ans[1] = tmp_fc * param_biga * tmp_exp; -} + if (eflag) ans[1] = tmp_fc * param_biga * tmp_exp; +} /* ---------------------------------------------------------------------- */ @@ -441,7 +443,7 @@ ucl_inline numtyp zeta(const numtyp param_powermint, const numtyp param_bigr, const numtyp param_bigd, const numtyp param_c, - const numtyp param_d, + const numtyp param_d, const numtyp param_h, const numtyp param_gamma, const numtyp rsqij, @@ -464,23 +466,23 @@ ucl_inline numtyp zeta(const numtyp param_powermint, else if (arg < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(arg); - return ters_fc(rik,param_bigr,param_bigd) * + return ters_fc(rik,param_bigr,param_bigd) * ters_gijk(costheta,param_c, param_d, param_h, param_gamma) * ex_delr; } /* ---------------------------------------------------------------------- */ -ucl_inline void force_zeta(const numtyp param_bigb, +ucl_inline void force_zeta(const numtyp param_bigb, const numtyp param_bigr, const numtyp param_bigd, const numtyp param_lam2, - const numtyp param_beta, + const numtyp param_beta, const numtyp param_powern, - const numtyp param_c1, - const numtyp param_c2, + const numtyp param_c1, + const numtyp param_c2, const numtyp param_c3, const numtyp param_c4, - const numtyp rsq, + const numtyp rsq, const numtyp zeta_ij, const int eflag, numtyp fpfeng[4]) @@ -494,7 +496,7 @@ ucl_inline void force_zeta(const numtyp param_bigb, param_c1,param_c2, param_c3, param_c4); fpfeng[0] = (numtyp)0.5*bij*fa_d * ucl_recip(r); // fforce fpfeng[1] = (numtyp)-0.5*fa * ters_bij_d(zeta_ij,param_beta, param_powern, - param_c1,param_c2, param_c3, param_c4); // prefactor + param_c1,param_c2, param_c3, param_c4); // prefactor if (eflag) fpfeng[2] = (numtyp)0.5*bij*fa; // eng } @@ -504,23 +506,23 @@ ucl_inline void force_zeta(const numtyp param_bigb, use param_ijk cutoff for rik test ------------------------------------------------------------------------- */ -ucl_inline void attractive(const numtyp param_bigr, +ucl_inline void attractive(const numtyp param_bigr, const numtyp param_bigd, const numtyp param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, - const numtyp param_h, + const numtyp param_h, const numtyp param_gamma, const numtyp prefactor, - const numtyp rij, - const numtyp rijinv, + const numtyp rij, + const numtyp rijinv, const numtyp rik, - const numtyp rikinv, - const numtyp delrij[3], + const numtyp rikinv, + const numtyp delrij[3], const numtyp delrik[3], - numtyp fi[3], - numtyp fj[3], + numtyp fi[3], + numtyp fj[3], numtyp fk[3]) { numtyp rij_hat[3],rik_hat[3]; @@ -531,20 +533,20 @@ ucl_inline void attractive(const numtyp param_bigr, param_c, param_d, param_h, param_gamma, fi, fj, fk); } -ucl_inline void attractive_fi(const numtyp param_bigr, +ucl_inline void attractive_fi(const numtyp param_bigr, const numtyp param_bigd, const numtyp param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, - const numtyp param_h, + const numtyp param_h, const numtyp param_gamma, const numtyp prefactor, - const numtyp rij, - const numtyp rijinv, + const numtyp rij, + const numtyp rijinv, const numtyp rik, - const numtyp rikinv, - const numtyp delrij[3], + const numtyp rikinv, + const numtyp delrij[3], const numtyp delrik[3], numtyp fi[3]) { @@ -556,20 +558,20 @@ ucl_inline void attractive_fi(const numtyp param_bigr, param_c, param_d, param_h, param_gamma, fi); } -ucl_inline void attractive_fj(const numtyp param_bigr, +ucl_inline void attractive_fj(const numtyp param_bigr, const numtyp param_bigd, const numtyp param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, - const numtyp param_h, + const numtyp param_h, const numtyp param_gamma, const numtyp prefactor, - const numtyp rij, - const numtyp rijinv, + const numtyp rij, + const numtyp rijinv, const numtyp rik, - const numtyp rikinv, - const numtyp delrij[3], + const numtyp rikinv, + const numtyp delrij[3], const numtyp delrik[3], numtyp fj[3]) { @@ -581,20 +583,20 @@ ucl_inline void attractive_fj(const numtyp param_bigr, param_c, param_d, param_h, param_gamma, fj); } -ucl_inline void attractive_fk(const numtyp param_bigr, +ucl_inline void attractive_fk(const numtyp param_bigr, const numtyp param_bigd, const numtyp param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, - const numtyp param_h, + const numtyp param_h, const numtyp param_gamma, const numtyp prefactor, - const numtyp rij, - const numtyp rijinv, + const numtyp rij, + const numtyp rijinv, const numtyp rik, - const numtyp rikinv, - const numtyp delrij[3], + const numtyp rikinv, + const numtyp delrij[3], const numtyp delrik[3], numtyp fk[3]) {