From 67ae64329e185cbd2e860f148e0024d1eb1091e7 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 27 Jun 2014 17:25:22 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12146 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Nvidia.makefile | 32 ++++++++++++++++++++++++-- lib/gpu/Opencl.makefile | 25 +++++++++++++++++++-- lib/gpu/lal_born.cpp | 19 ++++++++++++++++ lib/gpu/lal_born.h | 8 ++++++- lib/gpu/lal_born_ext.cpp | 26 ++++++++++++++++++++++ lib/gpu/lal_buck.cpp | 18 +++++++++++++++ lib/gpu/lal_buck.h | 5 +++++ lib/gpu/lal_buck_ext.cpp | 25 +++++++++++++++++++++ lib/gpu/lal_coul_long.cpp | 38 +++++++++++++++++++------------ lib/gpu/lal_coul_long.cu | 38 ++++++++++++++++++------------- lib/gpu/lal_coul_long.h | 16 ++++++++----- lib/gpu/lal_coul_long_ext.cpp | 42 ++++++++++++++++++++++++++--------- lib/gpu/lal_gauss.cpp | 15 +++++++++++++ lib/gpu/lal_gauss.h | 8 +++++-- lib/gpu/lal_gauss_ext.cpp | 22 ++++++++++++++++++ lib/gpu/lal_lj.cpp | 17 ++++++++++++++ lib/gpu/lal_lj.h | 7 +++++- lib/gpu/lal_lj_expand.cpp | 20 +++++++++++++++++ lib/gpu/lal_lj_expand.h | 7 +++++- lib/gpu/lal_lj_expand_ext.cpp | 23 +++++++++++++++++++ lib/gpu/lal_lj_ext.cpp | 21 ++++++++++++++++++ lib/gpu/lal_soft.cpp | 15 +++++++++++++ lib/gpu/lal_soft.h | 8 +++++-- lib/gpu/lal_soft_ext.cpp | 22 ++++++++++++++++++ 24 files changed, 422 insertions(+), 55 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index da07cec1b7..f38b443845 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -67,7 +67,9 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \ $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \ - $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o + $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \ + $(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \ + $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \ @@ -114,7 +116,9 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \ $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \ $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \ - $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h + $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h \ + $(OBJ_DIR)/coul.cubin $(OBJ_DIR)/coul_cubin.h \ + $(OBJ_DIR)/coul_debye.cubin $(OBJ_DIR)/coul_debye_cubin.h all: $(OBJ_DIR) $(GPU_LIB) $(EXECS) @@ -676,6 +680,30 @@ $(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cubin.h $(OB $(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h $(CUDR) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/coul.cubin: lal_coul.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul.cu + +$(OBJ_DIR)/coul_cubin.h: $(OBJ_DIR)/coul.cubin $(OBJ_DIR)/coul.cubin + $(BIN2C) -c -n coul $(OBJ_DIR)/coul.cubin > $(OBJ_DIR)/coul_cubin.h + +$(OBJ_DIR)/lal_coul.o: $(ALL_H) lal_coul.h lal_coul.cpp $(OBJ_DIR)/coul_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_coul.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_coul_ext.o: $(ALL_H) lal_coul.h lal_coul_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_coul_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/coul_debye.cubin: lal_coul_debye.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_debye.cu + +$(OBJ_DIR)/coul_debye_cubin.h: $(OBJ_DIR)/coul_debye.cubin $(OBJ_DIR)/coul_debye.cubin + $(BIN2C) -c -n coul_debye $(OBJ_DIR)/coul_debye.cubin > $(OBJ_DIR)/coul_debye_cubin.h + +$(OBJ_DIR)/lal_coul_debye.o: $(ALL_H) lal_coul_debye.h lal_coul_debye.cpp $(OBJ_DIR)/coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_coul_debye.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_coul_debye_ext.o: $(ALL_H) lal_coul_debye.h lal_coul_debye_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_coul_debye_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 1d85c65354..b54704f63c 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -56,7 +56,9 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \ $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \ - $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o + $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \ + $(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \ + $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ @@ -79,7 +81,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \ $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \ $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \ - $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h + $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h \ + $(OBJ_DIR)/coul_cl.h $(OBJ_DIR)/coul_debye_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -485,6 +488,24 @@ $(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cl.h $(OBJ_ $(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h $(OCL) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/coul_cl.h: lal_coul.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh coul $(PRE1_H) lal_coul.cu $(OBJ_DIR)/coul_cl.h; + +$(OBJ_DIR)/lal_coul.o: $(ALL_H) lal_coul.h lal_coul.cpp $(OBJ_DIR)/coul_cl.h $(OBJ_DIR)/coul_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_coul.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_coul_ext.o: $(ALL_H) lal_coul.h lal_coul_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_coul_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/coul_debye_cl.h: lal_coul_debye.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh coul_debye $(PRE1_H) lal_coul_debye.cu $(OBJ_DIR)/coul_debye_cl.h; + +$(OBJ_DIR)/lal_coul_debye.o: $(ALL_H) lal_coul_debye.h lal_coul_debye.cpp $(OBJ_DIR)/coul_debye_cl.h $(OBJ_DIR)/coul_debye_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_coul_debye.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_coul_debye_ext.o: $(ALL_H) lal_coul_debye.h lal_coul_debye_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_coul_debye_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 719b1845d8..55cb24d3b0 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -97,6 +97,25 @@ int BornT::init(const int ntypes, double **host_cutsq, return 0; } +template +void BornT::reinit(const int ntypes, double **host_rhoinv, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **host_offset) { + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, + host_born1,host_born2,host_born3); + this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, + host_d,host_offset); +} + template void BornT::clear() { if (!_allocated) diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h index fe9c2a1819..6fed6461d2 100644 --- a/lib/gpu/lal_born.h +++ b/lib/gpu/lal_born.h @@ -45,7 +45,13 @@ class Born : public BaseAtomic { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_rhoinv, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **host_offset); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 3b593efb23..4dcc899623 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -92,6 +92,32 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, return init_ok; } +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void born_gpu_reinit(const int ntypes, double **host_rhoinv, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **offset) { + int world_me=BORNMF.device->world_me(); + int gpu_rank=BORNMF.device->gpu_rank(); + int procs_per_gpu=BORNMF.device->procs_per_gpu(); + + if (world_me==0) + BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, offset); + + BORNMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + void born_gpu_clear() { BORNMF.clear(); } diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index 53bfe3fae3..f66759ee3a 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -91,6 +91,24 @@ int BuckT::init(const int ntypes, double **host_cutsq, return 0; } +template +void BuckT::reinit(const int ntypes, double **host_cutsq, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset) { + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, + host_buck1,host_buck2,host_cutsq); + this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, + host_offset); +} + template void BuckT::clear() { if (!_allocated) diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index b0b6347c47..ebcd72d990 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -45,6 +45,11 @@ class Buck : public BaseAtomic { const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index af1f826751..b118a5b0eb 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -89,6 +89,31 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, return init_ok; } +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset) { + int world_me=BUCKMF.device->world_me(); + int gpu_rank=BUCKMF.device->gpu_rank(); + int procs_per_gpu=BUCKMF.device->procs_per_gpu(); + + if (world_me==0) + BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset); + + BUCKMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + void buck_gpu_clear() { BUCKMF.clear(); } diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp index 53de36474d..ab0c8ce665 100644 --- a/lib/gpu/lal_coul_long.cpp +++ b/lib/gpu/lal_coul_long.cpp @@ -43,21 +43,19 @@ int CoulLongT::bytes_per_atom(const int max_nbors) const { } template -int CoulLongT::init(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald) { +int CoulLongT::init(const int ntypes, double **host_scale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,coul_long,"k_coul_long"); if (success!=0) return success; - // we don't have atom types for coulomb only, - // but go with the minimum so that we can use - // the same infrastructure as lj/cut/coul/long/gpu. - int lj_types=1; + int lj_types=ntypes; shared_types=false; int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { @@ -69,13 +67,16 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); - + + scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_coul[i]; @@ -87,10 +88,18 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors, _g_ewald=g_ewald; _allocated=true; - this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_cl.row_bytes(); + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+scale.row_bytes()+ + sp_cl.row_bytes(); return 0; } +template +void CoulLongT::reinit(const int ntypes, double **host_scale) { + UCL_H_Vec hscale(_lj_types*_lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + this->atom->type_pack1(ntypes,_lj_types,scale,hscale,host_scale); +} + template void CoulLongT::clear() { if (!_allocated) @@ -99,6 +108,7 @@ void CoulLongT::clear() { lj1.clear(); lj3.clear(); + scale.clear(); sp_cl.clear(); this->clear_atomic(); } @@ -134,7 +144,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_cl, + this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -142,7 +152,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_cl, + this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 5eb89f9c42..fcc69c44ab 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -124,8 +124,7 @@ texture q_tex; #endif __kernel void k_coul_long(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, + const __global numtyp *restrict scale, const int lj_types, const __global numtyp *restrict sp_cl_in, const __global int *dev_nbor, @@ -161,6 +160,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, n_stride,list_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; numtyp qtmp; fetch(qtmp,i,q_tex); for ( ; nbor0) { - e_coul += prefactor*(_erfc-factor_coul); + e_coul += prefactor*(_erfc-factor_coul); } if (vflag>0) { virial[0] += delx*delx*force; @@ -215,8 +217,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict scale_in, const __global numtyp *restrict sp_cl_in, const __global int *dev_nbor, const __global int *dev_packed, @@ -230,10 +231,14 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; if (tid<4) sp_cl[tid]=sp_cl_in[tid]; - + if (tid0) { - e_coul += prefactor*(_erfc-factor_coul); + e_coul += prefactor*(_erfc-factor_coul); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h index 62103e7266..7e3f7f36a7 100644 --- a/lib/gpu/lal_coul_long.h +++ b/lib/gpu/lal_coul_long.h @@ -37,12 +37,16 @@ class CoulLong : public BaseCharge { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int nlocal, const int nall, const int max_nbors, + int init(const int ntypes, double **scale, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); - + const double gpu_split, FILE *screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **scale); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -59,6 +63,8 @@ class CoulLong : public BaseCharge { UCL_D_Vec lj1; /// lj3 dummy UCL_D_Vec lj3; + /// scale + UCL_D_Vec scale; /// Special Coul values [0-3] UCL_D_Vec sp_cl; diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 904c33527f..49d593d9a0 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -27,10 +27,11 @@ static CoulLong CLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int cl_gpu_init(const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald) { +int cl_gpu_init(const int ntypes, double **host_scale, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald) { CLMF.clear(); gpu_mode=CLMF.device->gpu_mode(); double gpu_split=CLMF.device->particle_split(); @@ -53,9 +54,9 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors, int init_ok=0; if (world_me==0) - init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, - screen, host_cut_coulsq, host_special_coul, qqrd2e, - g_ewald); + init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald); CLMF.device->world_barrier(); if (message) @@ -71,9 +72,9 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, - screen, host_cut_coulsq, host_special_coul, - qqrd2e, g_ewald); + init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald); CLMF.device->gpu_barrier(); if (message) @@ -87,6 +88,27 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors, return init_ok; } +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void cl_gpu_reinit(const int ntypes, double **host_scale) { + int world_me=CLMF.device->world_me(); + int gpu_rank=CLMF.device->gpu_rank(); + int procs_per_gpu=CLMF.device->procs_per_gpu(); + + if (world_me==0) + CLMF.reinit(ntypes, host_scale); + + CLMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + void cl_gpu_clear() { CLMF.clear(); } diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp index 9e296e6872..342ec4ecda 100644 --- a/lib/gpu/lal_gauss.cpp +++ b/lib/gpu/lal_gauss.cpp @@ -87,6 +87,21 @@ int GaussT::init(const int ntypes, return 0; } +template +void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a, + double **host_b, double **host_offset) { + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b, + host_cutsq,host_offset); +} + template void GaussT::clear() { if (!_allocated) diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h index fad790899d..1fd58adae5 100644 --- a/lib/gpu/lal_gauss.h +++ b/lib/gpu/lal_gauss.h @@ -14,7 +14,7 @@ ***************************************************************************/ #ifndef LAL_GAUSS_H -#define LAL_GAYSS_H +#define LAL_GAUSS_H #include "lal_base_atomic.h" @@ -43,7 +43,11 @@ class Gauss : public BaseAtomic { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_a, double **host_b, double **host_offset); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index 818c7cbd26..fdef451366 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -88,6 +88,28 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, return init_ok; } +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a, + double **host_b, double **offset) { + int world_me=GLMF.device->world_me(); + int gpu_rank=GLMF.device->gpu_rank(); + int procs_per_gpu=GLMF.device->procs_per_gpu(); + + if (world_me==0) + GLMF.reinit(ntypes, cutsq, host_a, host_b, offset); + + GLMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + void gauss_gpu_clear() { GLMF.clear(); } diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index 7cc3ee48b3..6c6e145319 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -92,6 +92,23 @@ int LJT::init(const int ntypes, return 0; } +template +void LJT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset) { + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq); + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); +} + template void LJT::clear() { if (!_allocated) diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h index 9555a3c881..63a3e8a6c9 100644 --- a/lib/gpu/lal_lj.h +++ b/lib/gpu/lal_lj.h @@ -43,7 +43,12 @@ class LJ : public BaseAtomic { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp index 474a91c6b3..03526bc095 100644 --- a/lib/gpu/lal_lj_expand.cpp +++ b/lib/gpu/lal_lj_expand.cpp @@ -92,6 +92,26 @@ int LJExpandT::init(const int ntypes, double **host_cutsq, return 0; } +template +void LJExpandT::reinit(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, + double **host_offset, double **host_shift) { + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_shift); + + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); +} + template void LJExpandT::clear() { if (!_allocated) diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h index fd59045bf8..0d0ae0b2e6 100644 --- a/lib/gpu/lal_lj_expand.h +++ b/lib/gpu/lal_lj_expand.h @@ -43,7 +43,12 @@ class LJExpand : public BaseAtomic { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_shift); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 725004b0ef..c1cc99fd12 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -89,6 +89,29 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, return init_ok; } +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +int lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift) { + int world_me=LJEMF.device->world_me(); + int gpu_rank=LJEMF.device->gpu_rank(); + int procs_per_gpu=LJEMF.device->procs_per_gpu(); + + if (world_me==0) + LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, shift); + LJEMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + void lje_gpu_clear() { LJEMF.clear(); } diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp index 3d85f689d0..674c1984bc 100644 --- a/lib/gpu/lal_lj_ext.cpp +++ b/lib/gpu/lal_lj_ext.cpp @@ -88,6 +88,27 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, return init_ok; } +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset) { + int world_me=LJLMF.device->world_me(); + int gpu_rank=LJLMF.device->gpu_rank(); + int procs_per_gpu=LJLMF.device->procs_per_gpu(); + + if (world_me==0) + LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset); + LJLMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + void ljl_gpu_clear() { LJLMF.clear(); } diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp index 8db144080e..c206a997a9 100644 --- a/lib/gpu/lal_soft.cpp +++ b/lib/gpu/lal_soft.cpp @@ -86,6 +86,21 @@ int SoftT::init(const int ntypes, double **host_cutsq, return 0; } +template +void SoftT::reinit(const int ntypes, double **host_cutsq, + double **host_prefactor, double **host_cut) { + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor, + host_cut,host_cutsq); +} + template void SoftT::clear() { if (!_allocated) diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h index 40e568e847..7fa529c4f5 100644 --- a/lib/gpu/lal_soft.h +++ b/lib/gpu/lal_soft.h @@ -13,8 +13,8 @@ email : nguyentd@ornl.gov ***************************************************************************/ -#ifndef LAL_GAUSS_H -#define LAL_GAYSS_H +#ifndef LAL_SOFT_H +#define LAL_SOFT_H #include "lal_base_atomic.h" @@ -44,6 +44,10 @@ class Soft : public BaseAtomic { const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_prefactor, double **host_cut); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 1872388bb3..5e7f38d054 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -88,6 +88,28 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, return init_ok; } +// --------------------------------------------------------------------------- +// Copy updated constants to device +// --------------------------------------------------------------------------- +void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor, + double **host_cut) { + int world_me=SLMF.device->world_me(); + int gpu_rank=SLMF.device->gpu_rank(); + int procs_per_gpu=SLMF.device->procs_per_gpu(); + + if (world_me==0) + SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut); + + SLMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + void soft_gpu_clear() { SLMF.clear(); }