From e7fc403e3cdc929779fc565c4a2f97d59b1acaf5 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown"
Date: Tue, 12 Apr 2011 20:04:38 -0400
Subject: [PATCH] Updating CHARMM test kernel.

---
 lib/gpu/charge_gpu_memory2.cpp |   8 ++-
 lib/gpu/charge_gpu_memory2.h   |   3 +-
 lib/gpu/crml_gpu_kernel2.cu    | 120 +++++++++++++++++++++++++--------
 lib/gpu/crml_gpu_memory2.cpp   |  14 ++--
 lib/gpu/pair_gpu_device.h      |   6 ++
 lib/gpu/pair_gpu_precision.h   |   2 -
 6 files changed, 113 insertions(+), 40 deletions(-)

diff --git a/lib/gpu/charge_gpu_memory2.cpp b/lib/gpu/charge_gpu_memory2.cpp
index 7ba035006a..7683d1b14f 100644
--- a/lib/gpu/charge_gpu_memory2.cpp
+++ b/lib/gpu/charge_gpu_memory2.cpp
@@ -25,7 +25,7 @@ ChargeGPUMemory2T::ChargeGPUMemory2() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
   ans=new PairGPUAns<numtyp,acctyp>();
   nbor=new PairGPUNbor();
-  nbor->packing(true);
+  _threads_per_atom=1;
 }
 
 template <class numtyp, class acctyp>
@@ -58,6 +58,12 @@ int ChargeGPUMemory2T::init_atomic(const int nlocal, const int nall,
   if (host_nlocal>0)
     _gpu_host=1;
 
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
   int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
                            maxspecial,_gpu_host,max_nbors,cell_size,false);
   if (success!=0)
diff --git a/lib/gpu/charge_gpu_memory2.h b/lib/gpu/charge_gpu_memory2.h
index e2accf4a2d..070008faaa 100644
--- a/lib/gpu/charge_gpu_memory2.h
+++ b/lib/gpu/charge_gpu_memory2.h
@@ -187,9 +187,10 @@ class ChargeGPUMemory2 {
  protected:
   bool _compiled;
-  int _block_size;
+  int _block_size, _threads_per_atom;
   double _max_bytes, _max_an_bytes;
   double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
 
   void compile_kernels(UCL_Device &dev, const char *pair_string);
 
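The two hunks above tie the new _threads_per_atom count to the neighbor storage the kernels will traverse: host-built lists (gpu_nbor==false) are repacked so the threads sharing one atom can stride through its neighbors, while device-built lists are used as-is. A minimal stand-alone sketch of that selection logic, assuming a hypothetical MockNbor in place of the library's PairGPUNbor and UCL_D_Vec<int> types:

    // Hypothetical mock of the decision made in init_atomic(); only the
    // branch logic mirrors the patch, the types are placeholders.
    struct MockNbor {
      int *dev_nbor;    // raw list: one row of neighbor indices per atom
      int *dev_packed;  // packed list: neighbors interleaved for strided access
      bool packing;     // request packed storage at list-build time
    };

    static int *select_nbor_data(MockNbor &nbor, int threads_per_atom,
                                 bool gpu_nbor) {
      if (threads_per_atom > 1 && gpu_nbor == false) {
        nbor.packing = true;
        return nbor.dev_packed;
      }
      return nbor.dev_nbor;
    }

Caching the chosen pointer in _nbor_data lets loop() pass a single argument regardless of which layout was selected; the kernels then test dev_nbor==dev_packed to recover the matching stride.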
diff --git a/lib/gpu/crml_gpu_kernel2.cu b/lib/gpu/crml_gpu_kernel2.cu
index a01eb64c7b..eda49f7411 100644
--- a/lib/gpu/crml_gpu_kernel2.cu
+++ b/lib/gpu/crml_gpu_kernel2.cu
@@ -19,6 +19,7 @@
 #define CRML_GPU_KERNEL
 
 #define MAX_BIO_SHARED_TYPES 128
+#define BLOCK_PAIR 64
 
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
@@ -98,18 +99,16 @@ __inline float fetch_q(const int& i, const float *q)
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
 
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          const int lj_types,
-                          __global numtyp *sp_lj_in, __global int *dev_nbor,
+                          const int lj_types, __global numtyp *sp_lj_in,
+                          __global int *dev_nbor, __global int *dev_packed,
                           __global acctyp4 *ans, __global acctyp *engv,
                           const int eflag, const int vflag, const int inum,
                           const int nall, const int nbor_pitch,
                           __global numtyp *q_, const numtyp cut_coulsq,
                           const numtyp qqrd2e, const numtyp g_ewald,
                           const numtyp denom_lj, const numtyp cut_bothsq,
-                          const numtyp cut_ljsq, const numtyp cut_lj_innersq) {
-
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          const numtyp cut_ljsq, const numtyp cut_lj_innersq,
+                          const int t_per_atom) {
   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -120,29 +119,50 @@
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  if (ii<inum) {
+  if (t_per_atom>1) {
+    __local acctyp s_energy[BLOCK_PAIR];
+    __local acctyp s_e_coul[BLOCK_PAIR];
+    __local acctyp4 s_f[BLOCK_PAIR];
+    __local acctyp s_virial[6][BLOCK_PAIR];
+
+    s_f[tid].x=f.x;
+    s_f[tid].y=f.y;
+    s_f[tid].z=f.z;
+    s_energy[tid]=energy;
+    s_e_coul[tid]=e_coul;
+    for (int v=0; v<6; v++)
+      s_virial[v][tid]=virial[v];
 
-    // Store answers
-    __global acctyp *ap1=engv+ii;
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        s_f[tid].x += s_f[tid+s].x;
+        s_f[tid].y += s_f[tid+s].y;
+        s_f[tid].z += s_f[tid+s].z;
+        s_energy[tid] += s_energy[tid+s];
+        s_e_coul[tid] += s_e_coul[tid+s];
+        s_virial[0][tid] += s_virial[0][tid+s];
+        s_virial[1][tid] += s_virial[1][tid+s];
+        s_virial[2][tid] += s_virial[2][tid+s];
+        s_virial[3][tid] += s_virial[3][tid+s];
+        s_virial[4][tid] += s_virial[4][tid+s];
+        s_virial[5][tid] += s_virial[5][tid+s];
+      }
+    }
+
+    f.x=s_f[tid].x;
+    f.y=s_f[tid].y;
+    f.z=s_f[tid].z;
+    energy=s_energy[tid];
+    e_coul=s_e_coul[tid];
+    for (int v=0; v<6; v++)
+      virial[v]=s_virial[v][tid];
+  }
+
+  // Store answers
+  __global acctyp *ap1=engv+ii;
+  if (ii<inum && offset==0) {
     if (eflag>0) {
       *ap1=energy;
       ap1+=inum;
@@ -235,12 +297,12 @@
       }
     }
     ans[ii]=f;
-  } // if ii
+  }
 }
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
-                              __global int *dev_list, __global acctyp4 *ans,
+                              __global int *dev_packed, __global acctyp4 *ans,
                               __global acctyp *engv, const int eflag,
                               const int vflag, const int inum, const int nall,
                               const int nbor_pitch, __global numtyp *q_,
@@ -261,7 +323,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
   if (tid<8)
     sp_lj[tid]=sp_lj_in[tid];
   ljd[tid]=ljd_in[tid];
-  ljd[tid+64]=ljd_in[tid+64];
+  ljd[tid+BLOCK_PAIR]=ljd_in[tid+BLOCK_PAIR];
 
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
@@ -285,12 +347,12 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
     int n_stride;
     __global int *list_end;
-    if (dev_nbor==dev_list) {
+    if (dev_nbor==dev_packed) {
       list_end=nbor+mul24(numj,nbor_pitch);
       nbor+=mul24(offset,nbor_pitch);
       n_stride=mul24(t_per_atom,nbor_pitch);
     } else {
-      nbor=dev_list+*nbor;
+      nbor=dev_packed+*nbor;
       list_end=nbor+numj;
       n_stride=t_per_atom;
       nbor+=offset;
     }
@@ -387,10 +449,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
 
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp s_energy[64];
-    __local acctyp s_e_coul[64];
-    __local acctyp4 s_f[64];
-    __local acctyp s_virial[6][64];
+    __local acctyp s_energy[BLOCK_PAIR];
+    __local acctyp s_e_coul[BLOCK_PAIR];
+    __local acctyp4 s_f[BLOCK_PAIR];
+    __local acctyp s_virial[6][BLOCK_PAIR];
 
     s_f[tid].x=f.x;
     s_f[tid].y=f.y;
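The reduction block added to kernel_pair (and generalized from the hard-coded 64-element arrays in kernel_pair_fast) combines the partial forces, energies, and virials accumulated by the t_per_atom threads that share one atom. Below is a self-contained CUDA sketch of the same tree reduction, trimmed to forces only; the kernel name and the partial/out buffers are hypothetical stand-ins, and it adds the explicit barriers that the 2011 code omitted because it relied on warp-synchronous execution (t_per_atom a power of two no larger than the warp):

    #include <cuda_runtime.h>

    #define BLOCK_PAIR 64   // same block size the patch hard-codes

    // Each atom is handled by t_per_atom consecutive threads ("lanes");
    // every lane holds one partial force, and lane 0 writes the per-atom sum.
    __global__ void reduce_forces(const float4 *partial, float4 *out,
                                  int inum, int t_per_atom) {
      __shared__ float4 s_f[BLOCK_PAIR];
      int tid = threadIdx.x;
      int ii = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
      int offset = tid % t_per_atom;   // lane index within this atom's group

      float4 f = make_float4(0.f, 0.f, 0.f, 0.f);
      if (ii < inum)
        f = partial[ii * t_per_atom + offset];   // this lane's partial force
      s_f[tid] = f;
      __syncthreads();

      // Halve the number of active lanes each step, exactly as in the patch.
      for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1) {
        if (offset < s) {
          s_f[tid].x += s_f[tid + s].x;
          s_f[tid].y += s_f[tid + s].y;
          s_f[tid].z += s_f[tid + s].z;
        }
        __syncthreads();
      }

      if (ii < inum && offset == 0)
        out[ii] = s_f[tid];
    }

Summing in __local/__shared__ memory keeps the per-neighbor force loop free of atomics; the cost is a log2(t_per_atom)-step pass over BLOCK_PAIR shared slots per block, which is why only lane 0 (offset==0) goes on to store the answers.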
diff --git a/lib/gpu/crml_gpu_memory2.cpp b/lib/gpu/crml_gpu_memory2.cpp
index 764fa5f48c..dd817dfc46 100644
--- a/lib/gpu/crml_gpu_memory2.cpp
+++ b/lib/gpu/crml_gpu_memory2.cpp
@@ -127,8 +127,6 @@ double CRML_GPU_Memory2T::host_memory_usage() const {
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
-  const int threads_per_atom=16;
-
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
@@ -143,7 +141,7 @@ void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
     vflag=0;
 
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/threads_per_atom)));
+                               (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int anall=this->atom->nall();
@@ -153,22 +151,24 @@ void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
                           &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                          &this->nbor->dev_packed.begin(),
+                          &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e,
                           &_g_ewald, &_denom_lj, &_cut_bothsq,
-                          &_cut_ljsq, &_cut_lj_innersq,&threads_per_atom);
+                          &_cut_ljsq, &_cut_lj_innersq,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &_lj_types,
                      &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->ans->dev_ans.begin(),
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
-                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
+                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index b11bfbc356..7c37afeab9 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -42,6 +42,12 @@
 #define BLOCK_CELL_ID 128
 // Default block size for neighbor list builds
 #define BLOCK_NBOR_BUILD 64
+// Maximum number of atom types that can be stored in shared memory
+// - Must be sqrt of BLOCK_PAIR
+#define MAX_SHARED_TYPES 8
+// Maximum number of atom types that can be stored in shared memory for bio
+// - Must be BLOCK_PAIR*2
+#define MAX_BIO_SHARED_TYPES 128
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4> class PPPMGPUMemory;
diff --git a/lib/gpu/pair_gpu_precision.h b/lib/gpu/pair_gpu_precision.h
index a5f57c1f95..902975be0b 100644
--- a/lib/gpu/pair_gpu_precision.h
+++ b/lib/gpu/pair_gpu_precision.h
@@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define acctyp4 _lgpu_float4
 #endif
 
-#define MAX_SHARED_TYPES 8
-#define MAX_BIO_SHARED_TYPES 128
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 
 #endif
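With t_per_atom lanes assigned to every atom, a block of BX threads now covers only BX/t_per_atom atoms, which is why loop() divides the grid size by that factor. A hedged host-side sketch of the sizing rule (the function name is illustrative, not part of the library):

    #include <cmath>

    // A block of block_size threads covers block_size/t_per_atom atoms,
    // so the launch grid must grow by the same factor.
    static int pair_grid_size(int inum, int block_size, int t_per_atom) {
      int atoms_per_block = block_size / t_per_atom;   // e.g. 64/16 = 4
      return static_cast<int>(ceil(static_cast<double>(inum)/atoms_per_block));
    }

For 10000 local atoms with BLOCK_PAIR=64 and the previously hard-coded threads_per_atom=16, this launches 2500 blocks where a one-thread-per-atom scheme needed 157; the extra blocks are what keep all cores busy when per-atom neighbor lists are short.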