added acceleration of lj/smooth on gpu

This commit is contained in:
Gurgen
2021-03-12 01:40:52 +03:00
parent e32d059d26
commit ca88f97a4b
5 changed files with 37 additions and 26 deletions

View File

@ -16,7 +16,7 @@
#if defined(USE_OPENCL)
#include "lj_smooth_cl.h"
#elif defined(USE_CUDART)
const char *lj=0;
const char *lj_smooth=0;
#else
#include "lj_smooth_cubin.h"
#endif
@ -51,7 +51,7 @@ int LJSMOOTHT::init(const int ntypes,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw4,
double **cut_inner, double **cut_inner_sq) {
int success;
@ -89,13 +89,17 @@ int LJSMOOTHT::init(const int ntypes,
this->atom->type_pack4(ntypes,lj_types,ljsw,host_write,host_ljsw1,host_ljsw2,
host_ljsw3,host_ljsw4);
ljsw0.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,ljsw0,host_write,host_ljsw0,cut_inner,host_ljsw2,
host_ljsw3);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+ljsw.row_bytes()+sp_lj.row_bytes();
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+ljsw.row_bytes()+ljsw0.row_bytes()+sp_lj.row_bytes();
return 0;
}
@ -103,7 +107,7 @@ template <class numtyp, class acctyp>
void LJSMOOTHT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw4,
double **cut_inner, double **cut_inner_sq) {
// Allocate a host write buffer for data initialization
@ -119,6 +123,8 @@ void LJSMOOTHT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
host_offset, cut_inner);
this->atom->type_pack4(ntypes,_lj_types,ljsw,host_write,host_ljsw1,host_ljsw2,
host_ljsw3,host_ljsw4);
this->atom->type_pack4(ntypes,_lj_types,ljsw0,host_write,host_ljsw0, cut_inner, host_ljsw2,
host_ljsw3);
}
template <class numtyp, class acctyp>
@ -130,6 +136,7 @@ void LJSMOOTHT::clear() {
lj1.clear();
lj3.clear();
ljsw.clear();
ljsw0.clear();
sp_lj.clear();
this->clear_atomic();
}
@ -165,14 +172,14 @@ void LJSMOOTHT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj,
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, &sp_lj,
this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);

View File

@ -28,6 +28,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const __global numtyp4 *restrict ljsw,
const __global numtyp4 *restrict ljsw0,
const int lj_types,
const __global numtyp *restrict sp_lj,
const __global int * dev_nbor,
@ -82,7 +83,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
}
else {
r = sqrt(rsq);
t = r - lj3[mtype].w;
t = r - ljsw0[mtype].y;
tsq = t*t;
fskin = ljsw[mtype].x + ljsw[mtype].y*t +
ljsw[mtype].z*tsq + ljsw[mtype].w*tsq*t;
@ -98,7 +99,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
if (rsq < lj1[mtype].w)
e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
else
e = ljsw[mtype].x - ljsw[mtype].x*t -
e = ljsw0[mtype].x - ljsw[mtype].x*t -
ljsw[mtype].y*tsq/2.0 - ljsw[mtype].z*tsq*t/3.0 -
ljsw[mtype].z*tsq*tsq/4.0 - lj3[mtype].z;
@ -125,6 +126,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp4 *restrict ljsw,
const __global numtyp4 *restrict ljsw0,
const __global numtyp *restrict sp_lj_in,
const __global int * dev_nbor,
const __global int * dev_packed,
@ -191,7 +193,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
}
else {
r = sqrt(rsq);
t = r - lj3[mtype].w;
t = r - ljsw0[mtype].y; //?
//printf("%f\n", r - lj3[mtype].w);
tsq = t*t;
fskin = ljsw[mtype].x + ljsw[mtype].y*t +
@ -208,7 +210,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
if (rsq < lj1[mtype].w)
e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
else
e = ljsw[mtype].x - ljsw[mtype].x*t -
e = ljsw0[mtype].x - ljsw[mtype].x*t -
ljsw[mtype].y*tsq/2.0 - ljsw[mtype].z*tsq*t/3.0 -
ljsw[mtype].z*tsq*tsq/4.0 - lj3[mtype].z; //???

View File

@ -43,16 +43,16 @@ class LJSMOOTH : public BaseAtomic<numtyp, acctyp> {
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw4,
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2,
double **host_ljsw3, double **host_ljsw4,
double **cut_inner, double **cut_inner_sq);
/// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw4,
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2,
double **host_ljsw3, double **host_ljsw4,
double **cut_inner, double **cut_inner_sq);
/// Clear all host and device data
@ -73,6 +73,8 @@ class LJSMOOTH : public BaseAtomic<numtyp, acctyp> {
UCL_D_Vec<numtyp4> lj3;
/// ljsw.x = ljsw1, ljsw.y = ljsw2, ljsw.z = ljsw3, ljsw.w = ljsw4
UCL_D_Vec<numtyp4> ljsw;
/// ljsw0
UCL_D_Vec<numtyp4> ljsw0;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;

View File

@ -32,7 +32,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw4, double **cut_inner, double **cut_inner_sq) {
LJSMTMF.clear();
gpu_mode=LJSMTMF.device->gpu_mode();
@ -59,7 +59,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
LJSMTMF.device->world_barrier();
if (message)
@ -77,7 +77,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
if (gpu_rank==i && world_me!=0)
init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3,
cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3,
host_ljsw4, cut_inner, cut_inner_sq);
LJSMTMF.device->gpu_barrier();
@ -97,19 +97,19 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
// ---------------------------------------------------------------------------
void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **offset, double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw4, double **cut_inner, double **cut_inner_sq) {
int world_me=LJSMTMF.device->world_me();
int gpu_rank=LJSMTMF.device->gpu_rank();
int procs_per_gpu=LJSMTMF.device->procs_per_gpu();
if (world_me==0)
LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
LJSMTMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
LJSMTMF.device->gpu_barrier();
}
}

View File

@ -45,15 +45,15 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw4,
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2,
double **host_ljsw3, double **host_ljsw4,
double **cut_inner, double **cut_innersq);
void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset,
double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
double **host_ljsw4,
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2,
double **host_ljsw3, double **host_ljsw4,
double **cut_inner, double **cut_innersq);
void ljsmt_gpu_clear();
@ -174,7 +174,7 @@ void PairLJSmoothGPU::init_style()
int success = ljsmt_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, ljsw1, ljsw2,
cell_size, gpu_mode, screen, ljsw0, ljsw1, ljsw2,
ljsw3, ljsw4, cut_inner, cut_inner_sq);
GPU_EXTRA::check_flag(success,error,world);
@ -191,7 +191,7 @@ void PairLJSmoothGPU::reinit()
{
Pair::reinit();
ljsmt_gpu_reinit(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, ljsw1, ljsw2, ljsw3, ljsw4, cut_inner, cut_inner_sq);
ljsmt_gpu_reinit(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, ljsw0, ljsw1, ljsw2, ljsw3, ljsw4, cut_inner, cut_inner_sq);
}
/* ---------------------------------------------------------------------- */