From bbffbe06a229a590abed9ba91fe850f1b00c71b5 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 13 Jan 2014 15:51:54 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11230 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Nvidia.makefile | 18 +- lib/gpu/Opencl.makefile | 15 +- lib/gpu/lal_answer.cpp | 35 ++-- lib/gpu/lal_buck_coul_long.cpp | 1 + lib/gpu/lal_coul_dsf.cpp | 2 +- lib/gpu/lal_coul_dsf.h | 2 +- lib/gpu/lal_sw.cpp | 172 ++++++++++++----- lib/gpu/lal_sw.cu | 327 ++++++++++++++++++++++----------- lib/gpu/lal_sw.h | 32 +++- lib/gpu/lal_sw_ext.cpp | 27 +-- 10 files changed, 438 insertions(+), 193 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index a5df7bf23d..e93805df3b 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -64,7 +64,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \ $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ - $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o + $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \ + $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \ @@ -109,7 +110,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \ $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \ $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \ - $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h + $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \ + $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h all: $(OBJ_DIR) $(GPU_LIB) $(EXECS) @@ -644,6 +646,18 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(O $(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lj_gromacs.cubin: lal_lj_gromacs.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_gromacs.cu + +$(OBJ_DIR)/lj_gromacs_cubin.h: $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs.cubin + $(BIN2C) -c -n lj_gromacs $(OBJ_DIR)/lj_gromacs.cubin > $(OBJ_DIR)/lj_gromacs_cubin.h + +$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 81e80588cd..bb528f1e2b 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -53,7 +53,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \ $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ - $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o + $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \ + $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ @@ -75,7 +76,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \ $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \ $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \ - $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h + $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \ + $(OBJ_DIR)/lj_gromacs_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -460,6 +462,15 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $( $(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lj_gromacs_cl.h: lal_lj_gromacs.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh lj_gromacs $(PRE1_H) lal_lj_gromacs.cu $(OBJ_DIR)/lj_gromacs_cl.h; + +$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index 7dbd875cd1..ddf893e4ed 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -150,7 +150,7 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag, csize-=_e_fields; if (!vflag) csize-=6; - + if (csize>0) engv.update_host(_inum*csize,true); if (_rot) @@ -194,12 +194,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int i=vstart; iclear_atomic(); } diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp index 4f338eb051..ca81d32b2d 100644 --- a/lib/gpu/lal_coul_dsf.cpp +++ b/lib/gpu/lal_coul_dsf.cpp @@ -10,7 +10,7 @@ __________________________________________________________________________ begin : 8/15/2012 - email : nguyentdw@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #if defined(USE_OPENCL) diff --git a/lib/gpu/lal_coul_dsf.h b/lib/gpu/lal_coul_dsf.h index 0579ab46d5..0c5b063026 100644 --- a/lib/gpu/lal_coul_dsf.h +++ b/lib/gpu/lal_coul_dsf.h @@ -10,7 +10,7 @@ __________________________________________________________________________ begin : 8/15/2012 - email : nguyentdw@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifndef LAL_LJ_DSF_H diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index bf6f820b69..27974874b0 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -16,7 +16,7 @@ #if defined(USE_OPENCL) #include "sw_cl.h" #elif defined(USE_CUDART) -const char *lj=0; +const char *sw=0; #else #include "sw_cubin.h" #endif @@ -43,28 +43,15 @@ int SWT::bytes_per_atom(const int max_nbors) const { } template -int SWT::init(const int nlocal, const int nall, const int max_nbors, - const double cell_size, const double gpu_split, FILE *_screen, - const double epsilon, const double sigma, - const double lambda, const double gamma, - const double costheta, const double biga, - const double bigb, const double powerp, - const double powerq, const double cut, const double cutsq) { - - sw_epsilon=static_cast(epsilon); - sw_sigma=static_cast(sigma); - sw_lambda=static_cast(lambda); - sw_gamma=static_cast(gamma); - sw_costheta=static_cast(costheta); - sw_biga=static_cast(biga); - sw_bigb=static_cast(bigb); - sw_powerp=static_cast(powerp); - sw_powerq=static_cast(powerq); - sw_cut=static_cast(cut); - sw_cutsq=static_cast(cutsq); - if (sw_cutsq>=sw_cut*sw_cut) - sw_cutsq=sw_cut*sw_cut-1e-4; - +int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors, + const double cell_size, const double gpu_split, FILE *_screen, + int* host_map, const int nelements, int*** host_elem2param, const int nparams, + const double* epsilon, const double* sigma, + const double* lambda, const double* gamma, + const double* costheta, const double* biga, + const double* bigb, const double* powerp, + const double* powerq, const double* cut, const double* cutsq) +{ int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,sw,"k_sw","k_sw_three_center", @@ -73,10 +60,93 @@ int SWT::init(const int nlocal, const int nall, const int max_nbors, return success; // If atom type constants fit in shared memory use fast kernel - shared_types=true; + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + _nparams = nparams; + _nelements = nelements; + + UCL_H_Vec dview(nparams,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + + for (int i=0; i(epsilon[i]); + dview[i].y=static_cast(sigma[i]); + dview[i].z=static_cast(lambda[i]); + dview[i].w=static_cast(gamma[i]); + } + + ucl_copy(sw1,dview,false); + sw1_tex.get_texture(*(this->pair_program),"sw1_tex"); + sw1_tex.bind_float(sw1,4); + + sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); + + for (int i=0; i(biga[i]); + dview[i].y=static_cast(bigb[i]); + dview[i].z=static_cast(powerp[i]); + dview[i].w=static_cast(powerq[i]); + } + + ucl_copy(sw2,dview,false); + sw2_tex.get_texture(*(this->pair_program),"sw2_tex"); + sw2_tex.bind_float(sw2,4); + + sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); + + for (int i=0; i(cut[i]); + dview[i].y=static_cast(cutsq[i]); + dview[i].z=static_cast(costheta[i]); + dview[i].w=(numtyp)0; + } + + ucl_copy(sw3,dview,false); + sw3_tex.get_texture(*(this->pair_program),"sw3_tex"); + sw3_tex.bind_float(sw3,4); + + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, + *(this->ucl_device), UCL_WRITE_ONLY); + + elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device), + UCL_READ_ONLY); + + for (int i = 0; i < nelements; i++) + for (int j = 0; j < nelements; j++) + for (int k = 0; k < nelements; k++) { + int idx = i*nelements*nelements+j*nelements+k; + dview_elem2param[idx] = host_elem2param[i][j][k]; + } + + ucl_copy(elem2param,dview_elem2param,false); + + UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < lj_types; i++) + dview_map[i] = host_map[i]; + + map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(map,dview_map,false); _allocated=true; - this->_max_bytes=0; + this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+ + map.row_bytes()+elem2param.row_bytes(); return 0; } @@ -86,6 +156,11 @@ void SWT::clear() { return; _allocated=false; + sw1.clear(); + sw2.clear(); + sw3.clear(); + map.clear(); + elem2param.clear(); this->clear_atomic(); } @@ -121,22 +196,23 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom, &sw_cut, &sw_epsilon, &sw_sigma, - &sw_biga, &sw_bigb, &sw_powerp, &sw_powerq, &sw_cutsq); + this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, + &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &evatom, - &sw_cut, &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma, - &sw_costheta, &sw_cutsq); + this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, + &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom, &evatom); + Answer *end_ans; #ifdef THREE_CONCURRENT end_ans=this->ans2; @@ -145,20 +221,20 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), &end_ans->force, - &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &sw_cut, - &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma, - &sw_costheta, &sw_cutsq); + this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, + &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom); + } else { this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), &end_ans->force, - &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &sw_cut, - &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma, - &sw_costheta, &sw_cutsq); + this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, + &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom); + } this->time_pair.stop(); } diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index f831b55b00..2408602e73 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -15,13 +15,24 @@ #ifdef NV_KERNEL #include "lal_aux_fun1.h" + #ifndef _DOUBLE_DOUBLE texture pos_tex; +texture sw1_tex; +texture sw2_tex; +texture sw3_tex; #else texture pos_tex; +texture sw1_tex; +texture sw2_tex; +texture sw3_tex; #endif + #else #define pos_tex x_ +#define sw1_tex sw1 +#define sw2_tex sw2 +#define sw3_tex sw3 #endif #define THIRD (numtyp)0.66666667 @@ -60,15 +71,15 @@ texture pos_tex; } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv+=energy*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv+=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]+=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ acctyp4 old=ans[ii]; \ @@ -97,15 +108,15 @@ texture pos_tex; } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv+=energy*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv+=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]+=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ acctyp4 old=ans[ii]; \ @@ -118,34 +129,20 @@ texture pos_tex; #endif -__kernel void k_sw(const __global numtyp4 *restrict x_, +__kernel void k_sw(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict sw1, + const __global numtyp4 *restrict sw2, + const __global numtyp4 *restrict sw3, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const __global int * dev_nbor, const __global int * dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, - const numtyp sw_cut, const numtyp sw_epsilon, - const numtyp sw_sigma, const numtyp sw_biga, - const numtyp sw_bigb, const numtyp sw_powerp, - const numtyp sw_powerq, const numtyp sw_cutsq) { - + const int nbor_pitch, const int t_per_atom) { __local int n_stride; - __local numtyp pre_sw_c1, pre_sw_c2, pre_sw_c3, pre_sw_c4; - __local numtyp pre_sw_c5, pre_sw_c6; - pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb* - pow(sw_sigma,sw_powerp); - pre_sw_c2=sw_biga*sw_epsilon*sw_powerq* - pow(sw_sigma,sw_powerq); - pre_sw_c3=sw_biga*sw_epsilon*sw_bigb* - pow(sw_sigma,sw_powerp+(numtyp)1.0); - pre_sw_c4=sw_biga*sw_epsilon* - pow(sw_sigma,sw_powerq+(numtyp)1.0); - pre_sw_c5=sw_biga*sw_epsilon*sw_bigb* - pow(sw_sigma,sw_powerp); - pre_sw_c6=sw_biga*sw_epsilon* - pow(sw_sigma,sw_powerq); - int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -157,7 +154,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii sw_cutsq) continue; + + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; + numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex); + + if (rsq1 > sw3_ijparam.y) continue; + + numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); + sw_sigma=sw1_ijparam.y; + sw_gamma=sw1_ijparam.w; + sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; + sw_cut_ij=sw3_ijparam.x; const __global int *nbor_k=nbor_j-offset_j+offset_k; if (nbor_k<=nbor_j) @@ -372,11 +407,31 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, k &= NEIGHMASK; numtyp4 kx; fetch4(kx,k,pos_tex); + int ktype=kx.w; + ktype=map[ktype]; + int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; + numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex); + numtyp delr2x = kx.x-ix.x; numtyp delr2y = kx.y-ix.y; numtyp delr2z = kx.z-ix.z; numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z; - if (rsq2 < sw_cutsq) { + if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y; + numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); + sw_sigma=sw1_ikparam.y; + sw_gamma=sw1_ikparam.w; + sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; + sw_cut_ik=sw3_ikparam.x; + + int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype]; + numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); + sw_epsilon=sw1_ijkparam.x; + sw_lambda=sw1_ijkparam.z; + sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; + sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; + numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); + sw_costheta_ijk=sw3_ijkparam.z; + numtyp fjx, fjy, fjz, fkx, fky, fkz; threebody(delr1x,delr1y,delr1z,eflag,energy); @@ -403,23 +458,24 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, } __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict sw1, + const __global numtyp4 *restrict sw2, + const __global numtyp4 *restrict sw3, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const __global int * dev_nbor, const __global int * dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom, const numtyp sw_cut, - const numtyp sw_epsilon, const numtyp sw_sigma, - const numtyp sw_lambda, const numtyp sw_gamma, - const numtyp sw_costheta, const numtyp sw_cutsq) { + const int t_per_atom) { __local int tpa_sq, n_stride; - __local numtyp sw_sigma_gamma, sw_lambda_epsilon; - __local numtyp sw_lambda_epsilon2; tpa_sq=fast_mul(t_per_atom,t_per_atom); - sw_sigma_gamma=sw_sigma*sw_gamma; - sw_lambda_epsilon=sw_lambda*sw_epsilon; - sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon; + numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma; + numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; + numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); @@ -443,23 +499,33 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, int offset_k=tid & (t_per_atom-1); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //int iw=ix.w; - //int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + int itype=ix.w; + itype=map[itype]; for ( ; nbor_j sw_cutsq) continue; + + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; + numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex); + + if (rsq1 > sw3_ijparam.y) continue; + + numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); + sw_sigma=sw1_ijparam.y; + sw_gamma=sw1_ijparam.w; + sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; + sw_cut_ij=sw3_ijparam.x; const __global int *nbor_k=dev_nbor+j+nbor_pitch; int numk=*nbor_k; @@ -478,15 +544,35 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, int k=*nbor_k; k &= NEIGHMASK; - if (k == i) - continue; + if (k == i) continue; numtyp4 kx; fetch4(kx,k,pos_tex); + int ktype=kx.w; + ktype=map[ktype]; + int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; + numtyp delr2x = kx.x - jx.x; numtyp delr2y = kx.y - jx.y; numtyp delr2z = kx.z - jx.z; numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z; - if (rsq2 < sw_cutsq) { + numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex); + + if (rsq2 < sw3_ikparam.y) { + numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); + sw_sigma=sw1_ikparam.y; + sw_gamma=sw1_ikparam.w; + sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; + sw_cut_ik=sw3_ikparam.x; + + int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype]; + numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); + sw_epsilon=sw1_ijkparam.x; + sw_lambda=sw1_ijkparam.z; + sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; + sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; + numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); + sw_costheta_ijk=sw3_ijkparam.z; + numtyp fjx, fjy, fjz; //if (evatom==0) { threebody_half(delr1x,delr1y,delr1z); @@ -512,24 +598,25 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, +__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict sw1, + const __global numtyp4 *restrict sw2, + const __global numtyp4 *restrict sw3, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const __global int * dev_nbor, const __global int * dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom, const numtyp sw_cut, - const numtyp sw_epsilon, const numtyp sw_sigma, - const numtyp sw_lambda, const numtyp sw_gamma, - const numtyp sw_costheta, const numtyp sw_cutsq) { + const int t_per_atom) { __local int tpa_sq, n_stride; - __local numtyp sw_sigma_gamma, sw_lambda_epsilon; - __local numtyp sw_lambda_epsilon2; tpa_sq=fast_mul(t_per_atom,t_per_atom); - sw_sigma_gamma=sw_sigma*sw_gamma; - sw_lambda_epsilon=sw_lambda*sw_epsilon; - sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon; + numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma; + numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; + numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); @@ -553,26 +640,36 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, int offset_k=tid & (t_per_atom-1); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //int iw=ix.w; - //int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + int itype=ix.w; + itype=map[itype]; for ( ; nbor_j sw_cutsq) continue; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; + numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex); + + if (rsq1 > sw3_ijparam.y) continue; + + numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); + sw_sigma=sw1_ijparam.y; + sw_gamma=sw1_ijparam.w; + sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; + sw_cut_ij=sw3_ijparam.x; + const __global int *nbor_k=dev_nbor+j+nbor_pitch; - int numk=*nbor_k; + int numk=*nbor_k; if (dev_nbor==dev_packed) { nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); @@ -588,15 +685,35 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, int k=*nbor_k; k &= NEIGHMASK; - if (k == i) - continue; + if (k == i) continue; numtyp4 kx; fetch4(kx,k,pos_tex); + int ktype=kx.w; + ktype=map[ktype]; + int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; + numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex); + numtyp delr2x = kx.x - jx.x; numtyp delr2y = kx.y - jx.y; numtyp delr2z = kx.z - jx.z; numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z; - if (rsq2 < sw_cutsq) { + + if (rsq2 < sw3_ikparam.y) { + numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); + sw_sigma=sw1_ikparam.y; + sw_gamma=sw1_ikparam.w; + sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; + sw_cut_ik=sw3_ikparam.x; + + int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype]; + numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); + sw_epsilon=sw1_ijkparam.x; + sw_lambda=sw1_ijkparam.z; + sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; + sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; + numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); + sw_costheta_ijk=sw3_ijkparam.z; + numtyp fjx, fjy, fjz, fkx, fky, fkz; threebody(delr1x,delr1y,delr1z,eflag,energy); diff --git a/lib/gpu/lal_sw.h b/lib/gpu/lal_sw.h index 82fc7d24e0..66b36a90b0 100644 --- a/lib/gpu/lal_sw.h +++ b/lib/gpu/lal_sw.h @@ -37,13 +37,14 @@ class SW : public BaseThree { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int nlocal, const int nall, const int max_nbors, + int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *screen, - const double epsilon, const double sigma, - const double lambda, const double gamma, - const double costheta, const double biga, - const double bigb, const double powerp, - const double powerq, const double cut, const double cutsq); + int* host_map, const int nelements, int*** host_elem2param, const int nparams, + const double* epsilon, const double* sigma, + const double* lambda, const double* gamma, + const double* costheta, const double* biga, + const double* bigb, const double* powerp, + const double* powerq, const double* cut, const double* cutsq); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ @@ -60,11 +61,26 @@ class SW : public BaseThree { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; + /// Number of atom types + int _lj_types; + + /// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma + UCL_D_Vec sw1; + /// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq + UCL_D_Vec sw2; + /// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta + UCL_D_Vec sw3; + + UCL_D_Vec elem2param; + UCL_D_Vec map; + int _nparams,_nelements; + + UCL_Texture sw1_tex, sw2_tex, sw3_tex; + private: bool _allocated; void loop(const bool _eflag, const bool _vflag, const int evatom); - numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta; - numtyp sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq; + }; } diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index 33abf073d6..16b6828325 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -27,14 +27,15 @@ static SW SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sw_gpu_init(const int inum, const int nall, const int max_nbors, +int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen, - const double sw_epsilon, const double sw_sigma, - const double sw_lambda, const double sw_gamma, - const double sw_costheta, const double sw_biga, - const double sw_bigb, const double sw_powerp, - const double sw_powerq, const double sw_cut, - const double sw_cutsq) { + int* host_map, const int nelements, int*** host_elem2param, const int nparams, + const double* sw_epsilon, const double* sw_sigma, + const double* sw_lambda, const double* sw_gamma, + const double* sw_costheta, const double* sw_biga, + const double* sw_bigb, const double* sw_powerp, + const double* sw_powerq, const double* sw_cut, + const double* sw_cutsq) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); double gpu_split=SWMF.device->particle_split(); @@ -55,13 +56,14 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors, message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) - init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen, + init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + host_map, nelements, host_elem2param, nparams, sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); @@ -72,14 +74,15 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors, for (int i=0; i