From ca88f97a4bf6eec438c8719751aca705c131819c Mon Sep 17 00:00:00 2001 From: Gurgen Date: Fri, 12 Mar 2021 01:40:52 +0300 Subject: [PATCH] added acceleration of lj/smooth on gpu --- lib/gpu/lal_lj_smooth.cpp | 19 +++++++++++++------ lib/gpu/lal_lj_smooth.cu | 10 ++++++---- lib/gpu/lal_lj_smooth.h | 10 ++++++---- lib/gpu/lal_lj_smooth_ext.cpp | 12 ++++++------ src/GPU/pair_lj_smooth_gpu.cpp | 12 ++++++------ 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_lj_smooth.cpp b/lib/gpu/lal_lj_smooth.cpp index 5c75539660..d2ab63549d 100644 --- a/lib/gpu/lal_lj_smooth.cpp +++ b/lib/gpu/lal_lj_smooth.cpp @@ -16,7 +16,7 @@ #if defined(USE_OPENCL) #include "lj_smooth_cl.h" #elif defined(USE_CUDART) -const char *lj=0; +const char *lj_smooth=0; #else #include "lj_smooth_cubin.h" #endif @@ -51,7 +51,7 @@ int LJSMOOTHT::init(const int ntypes, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { int success; @@ -88,6 +88,10 @@ int LJSMOOTHT::init(const int ntypes, ljsw.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,ljsw,host_write,host_ljsw1,host_ljsw2, host_ljsw3,host_ljsw4); + + ljsw0.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,ljsw0,host_write,host_ljsw0,cut_inner,host_ljsw2, + host_ljsw3); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -95,7 +99,7 @@ int LJSMOOTHT::init(const int ntypes, ucl_copy(sp_lj,dview,false); _allocated=true; - this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+ljsw.row_bytes()+sp_lj.row_bytes(); + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+ljsw.row_bytes()+ljsw0.row_bytes()+sp_lj.row_bytes(); return 0; } @@ -103,7 +107,7 @@ template void LJSMOOTHT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { // Allocate a host write buffer for data initialization @@ -119,6 +123,8 @@ void LJSMOOTHT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, host_offset, cut_inner); this->atom->type_pack4(ntypes,_lj_types,ljsw,host_write,host_ljsw1,host_ljsw2, host_ljsw3,host_ljsw4); + this->atom->type_pack4(ntypes,_lj_types,ljsw0,host_write,host_ljsw0, cut_inner, host_ljsw2, + host_ljsw3); } template @@ -130,6 +136,7 @@ void LJSMOOTHT::clear() { lj1.clear(); lj3.clear(); ljsw.clear(); + ljsw0.clear(); sp_lj.clear(); this->clear_atomic(); } @@ -165,14 +172,14 @@ void LJSMOOTHT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); diff --git a/lib/gpu/lal_lj_smooth.cu b/lib/gpu/lal_lj_smooth.cu index b69f8b9388..345da1c702 100644 --- a/lib/gpu/lal_lj_smooth.cu +++ b/lib/gpu/lal_lj_smooth.cu @@ -28,6 +28,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj3, const __global numtyp4 *restrict ljsw, + const __global numtyp4 *restrict ljsw0, const int lj_types, const __global numtyp *restrict sp_lj, const __global int * dev_nbor, @@ -82,7 +83,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, } else { r = sqrt(rsq); - t = r - lj3[mtype].w; + t = r - ljsw0[mtype].y; tsq = t*t; fskin = ljsw[mtype].x + ljsw[mtype].y*t + ljsw[mtype].z*tsq + ljsw[mtype].w*tsq*t; @@ -98,7 +99,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].w) e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z; else - e = ljsw[mtype].x - ljsw[mtype].x*t - + e = ljsw0[mtype].x - ljsw[mtype].x*t - ljsw[mtype].y*tsq/2.0 - ljsw[mtype].z*tsq*t/3.0 - ljsw[mtype].z*tsq*tsq/4.0 - lj3[mtype].z; @@ -125,6 +126,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj3_in, const __global numtyp4 *restrict ljsw, + const __global numtyp4 *restrict ljsw0, const __global numtyp *restrict sp_lj_in, const __global int * dev_nbor, const __global int * dev_packed, @@ -191,7 +193,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, } else { r = sqrt(rsq); - t = r - lj3[mtype].w; + t = r - ljsw0[mtype].y; //? //printf("%f\n", r - lj3[mtype].w); tsq = t*t; fskin = ljsw[mtype].x + ljsw[mtype].y*t + @@ -208,7 +210,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].w) e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z; else - e = ljsw[mtype].x - ljsw[mtype].x*t - + e = ljsw0[mtype].x - ljsw[mtype].x*t - ljsw[mtype].y*tsq/2.0 - ljsw[mtype].z*tsq*t/3.0 - ljsw[mtype].z*tsq*tsq/4.0 - lj3[mtype].z; //??? diff --git a/lib/gpu/lal_lj_smooth.h b/lib/gpu/lal_lj_smooth.h index 98dbad87b7..0a4cdf78eb 100644 --- a/lib/gpu/lal_lj_smooth.h +++ b/lib/gpu/lal_lj_smooth.h @@ -43,16 +43,16 @@ class LJSMOOTH : public BaseAtomic { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_inner_sq); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_inner_sq); /// Clear all host and device data @@ -73,6 +73,8 @@ class LJSMOOTH : public BaseAtomic { UCL_D_Vec lj3; /// ljsw.x = ljsw1, ljsw.y = ljsw2, ljsw.z = ljsw3, ljsw.w = ljsw4 UCL_D_Vec ljsw; + /// ljsw0 + UCL_D_Vec ljsw0; /// Special LJ values UCL_D_Vec sp_lj; diff --git a/lib/gpu/lal_lj_smooth_ext.cpp b/lib/gpu/lal_lj_smooth_ext.cpp index 92624bf7fa..fd4dfd46be 100644 --- a/lib/gpu/lal_lj_smooth_ext.cpp +++ b/lib/gpu/lal_lj_smooth_ext.cpp @@ -32,7 +32,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { LJSMTMF.clear(); gpu_mode=LJSMTMF.device->gpu_mode(); @@ -59,7 +59,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, - host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); + host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); LJSMTMF.device->world_barrier(); if (message) @@ -77,7 +77,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, - cell_size, gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); LJSMTMF.device->gpu_barrier(); @@ -97,19 +97,19 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, // --------------------------------------------------------------------------- void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **offset, double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { int world_me=LJSMTMF.device->world_me(); int gpu_rank=LJSMTMF.device->gpu_rank(); int procs_per_gpu=LJSMTMF.device->procs_per_gpu(); if (world_me==0) - LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); + LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); LJSMTMF.device->world_barrier(); for (int i=0; igpu_barrier(); } } diff --git a/src/GPU/pair_lj_smooth_gpu.cpp b/src/GPU/pair_lj_smooth_gpu.cpp index 882055e84d..0203350507 100644 --- a/src/GPU/pair_lj_smooth_gpu.cpp +++ b/src/GPU/pair_lj_smooth_gpu.cpp @@ -45,15 +45,15 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **offset, double *special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_innersq); void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, double **cut_inner, double **cut_innersq); void ljsmt_gpu_clear(); @@ -174,7 +174,7 @@ void PairLJSmoothGPU::init_style() int success = ljsmt_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, ljsw1, ljsw2, + cell_size, gpu_mode, screen, ljsw0, ljsw1, ljsw2, ljsw3, ljsw4, cut_inner, cut_inner_sq); GPU_EXTRA::check_flag(success,error,world); @@ -191,7 +191,7 @@ void PairLJSmoothGPU::reinit() { Pair::reinit(); - ljsmt_gpu_reinit(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, ljsw1, ljsw2, ljsw3, ljsw4, cut_inner, cut_inner_sq); + ljsmt_gpu_reinit(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, ljsw0, ljsw1, ljsw2, ljsw3, ljsw4, cut_inner, cut_inner_sq); } /* ---------------------------------------------------------------------- */