diff --git a/lib/gpu/lal_lj_smooth.cpp b/lib/gpu/lal_lj_smooth.cpp index 59ebd0f636..4457ce79d2 100644 --- a/lib/gpu/lal_lj_smooth.cpp +++ b/lib/gpu/lal_lj_smooth.cpp @@ -51,16 +51,31 @@ int LJSMOOTHT::init(const int ntypes, double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i<ntypes; i++) + for (int j=i; j<ntypes; j++) + if (host_cutsq[i][j]>0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_smooth,"k_lj_smooth"); + _screen,lj_smooth,"k_lj_smooth",onetype); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -145,19 +160,9 @@ double LJSMOOTHT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> -int LJSMOOTHT::loop(const int _eflag, const int _vflag) { +int LJSMOOTHT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -166,8 +171,8 @@ int LJSMOOTHT::loop(const int _eflag, const int _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, 
&ljsw0, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -180,6 +185,7 @@ int LJSMOOTHT::loop(const int _eflag, const int _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJSMOOTH; diff --git a/lib/gpu/lal_lj_smooth.cu b/lib/gpu/lal_lj_smooth.cu index fa87e6fcee..346395513c 100644 --- a/lib/gpu/lal_lj_smooth.cu +++ b/lib/gpu/lal_lj_smooth.cu @@ -40,16 +40,20 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq < lj1[mtype].w) e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z; @@ -108,7 +112,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, //numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -119,9 +123,9 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, @@ -139,6 +143,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef 
ONETYPE __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -146,40 +151,60 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } + __syncthreads(); + #else + const numtyp lj1x=lj1_in[ONETYPE].x; + const numtyp lj1y=lj1_in[ONETYPE].y; + const numtyp cutsq=lj1_in[ONETYPE].z; + numtyp lj3x, lj3y, lj3z; + if (EVFLAG && eflag) { + lj3x=lj3_in[ONETYPE].x; + lj3y=lj3_in[ONETYPE].y; + lj3z=lj3_in[ONETYPE].z; + } + #endif + + int n_stride; + local_allocate_store_pair(); - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq < lj1[mtype].w) e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z; @@ -218,7 +243,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -229,7 +254,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj_smooth_ext.cpp b/lib/gpu/lal_lj_smooth_ext.cpp index aaebbe1493..7312c8b257 100644 --- a/lib/gpu/lal_lj_smooth_ext.cpp +++ b/lib/gpu/lal_lj_smooth_ext.cpp @@ -57,7 +57,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, 
offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); @@ -76,7 +76,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); diff --git a/src/GPU/pair_lj_smooth_gpu.cpp b/src/GPU/pair_lj_smooth_gpu.cpp index 0203350507..282e189180 100644 --- a/src/GPU/pair_lj_smooth_gpu.cpp +++ b/src/GPU/pair_lj_smooth_gpu.cpp @@ -171,9 +171,10 @@ void PairLJSmoothGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljsmt_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, ljsw0, ljsw1, ljsw2, ljsw3, ljsw4, cut_inner, cut_inner_sq); GPU_EXTRA::check_flag(success,error,world);