diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp index 3205a8289d..6d3eca0996 100644 --- a/lib/gpu/atomic_gpu_memory.cpp +++ b/lib/gpu/atomic_gpu_memory.cpp @@ -57,6 +57,13 @@ int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; + _threads_per_atom=device->threads_per_atom(); + if (_threads_per_atom>1 && gpu_nbor==false) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false); if (success!=0) diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h index d82f4371d3..02ee24fab0 100644 --- a/lib/gpu/atomic_gpu_memory.h +++ b/lib/gpu/atomic_gpu_memory.h @@ -189,9 +189,10 @@ class AtomicGPUMemory { protected: bool _compiled; - int _block_size; + int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const char *pair_string); diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp index 2d8bc3ddbd..8c8f231bf8 100644 --- a/lib/gpu/charge_gpu_memory.cpp +++ b/lib/gpu/charge_gpu_memory.cpp @@ -57,6 +57,13 @@ int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; + _threads_per_atom=device->threads_per_atom(); + if (_threads_per_atom>1 && gpu_nbor==false) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false); if (success!=0) diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h index a41440fe02..86e97b7728 100644 --- a/lib/gpu/charge_gpu_memory.h +++ b/lib/gpu/charge_gpu_memory.h @@ -187,9 +187,10 @@ class ChargeGPUMemory { protected: bool _compiled; - int _block_size, _block_bio_size; + int _block_size, _block_bio_size, _threads_per_atom; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const char *pair_string); diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index 5b17813d27..b2b7796025 100644 --- a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -70,6 +70,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -81,40 +82,56 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -182,49 +238,64 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in,__global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + __global numtyp* sp_lj_in,__global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp index 3a2c11300e..8a5949c9e7 100644 --- a/lib/gpu/cmm_cut_gpu_memory.cpp +++ b/lib/gpu/cmm_cut_gpu_memory.cpp @@ -126,7 +126,8 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -137,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index c9f437f891..c99a2a68c2 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -88,6 +88,7 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -99,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -116,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -233,51 +291,67 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp index 2073c99074..e2f99fceca 100644 --- a/lib/gpu/cmmc_long_gpu_memory.cpp +++ b/lib/gpu/cmmc_long_gpu_memory.cpp @@ -137,7 +137,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -148,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald); + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald); + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/cmmc_msm_gpu_kernel.cu b/lib/gpu/cmmc_msm_gpu_kernel.cu index 913587ce52..5ad765f8c4 100644 --- a/lib/gpu/cmmc_msm_gpu_kernel.cu +++ b/lib/gpu/cmmc_msm_gpu_kernel.cu @@ -80,6 +80,7 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -91,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const int smooth) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const int smooth, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -111,33 +116,45 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __local numtyp _ia2; __local numtyp _ia3; + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -252,24 +310,39 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const int smooth) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const int smooth, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmmc_msm_gpu_memory.cpp b/lib/gpu/cmmc_msm_gpu_memory.cpp index d48e3c505f..ca051d4803 100644 --- a/lib/gpu/cmmc_msm_gpu_memory.cpp +++ b/lib/gpu/cmmc_msm_gpu_memory.cpp @@ -137,7 +137,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -148,19 +149,21 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_smooth); + &_qqrd2e, &_smooth, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_smooth); + &_cut_coulsq, &_qqrd2e, &_smooth, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu index 0a03ea072b..faea43341a 100644 --- a/lib/gpu/crml_gpu_kernel.cu +++ b/lib/gpu/crml_gpu_kernel.cu @@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_BIO_PAIR 64 #endif @@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q) __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, - const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, + const int lj_types, __global numtyp *sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, const numtyp cut_lj_innersq) { + const numtyp cut_ljsq, const numtyp cut_lj_innersq, + const int t_per_atom) { + + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -120,29 +125,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -246,45 +304,59 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, __global numtyp *q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, - const numtyp cut_lj_innersq) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - ljd[ii]=ljd_in[ii]; - if (ii+641) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/crml_gpu_memory.cpp b/lib/gpu/crml_gpu_memory.cpp index 2b30b8fc94..6661f67585 100644 --- a/lib/gpu/crml_gpu_memory.cpp +++ b/lib/gpu/crml_gpu_memory.cpp @@ -141,7 +141,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -151,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq, - &_cut_ljsq, &_cut_lj_innersq); + &_cut_ljsq, &_cut_lj_innersq, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, - &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq); + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp index 4593ba5452..70eb4d9344 100644 --- a/lib/gpu/gb_gpu.cpp +++ b/lib/gpu/gb_gpu.cpp @@ -214,7 +214,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(gbm.ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(gbm.ans->inum())/ + (BX/gbm._threads_per_atom))); int stride=gbm.nbor->nbor_pitch(); int ainum=gbm.ans->inum(); int anall=gbm.atom->nall(); @@ -224,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { if (gbm.last_ellipse>0) { // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ - static_cast(BX))); + (BX/gbm._threads_per_atom))); gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); @@ -236,7 +237,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(), - &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall); + &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall, + &gbm._threads_per_atom); gbm.time_gayberne.stop(); if (gbm.last_ellipse==gbm.ans->inum()) { @@ -253,7 +255,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { gbm.time_kernel2.start(); GX=static_cast(ceil(static_cast(gbm.ans->inum()- - gbm.last_ellipse)/BX)); + gbm.last_ellipse)/ + (BX/gbm._threads_per_atom))); gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(), SPHERE_ELLIPSE,SPHERE_ELLIPSE); gbm.time_kernel2.stop(); @@ -266,7 +269,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(), &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, - &vflag, &gbm.last_ellipse, &ainum, &anall); + &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); gbm.time_gayberne2.stop(); } else { gbm.ans->dev_ans.zero(); @@ -290,7 +294,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &stride, &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(), &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); } else { GBMF.k_lj.set_size(GX,BX); GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), @@ -298,7 +303,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.gamma_upsilon_mu.begin(), &stride, &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(), &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); } } gbm.time_pair.stop(); @@ -315,7 +321,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(), &ainum, &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &ainum, &anall); + &eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom); gbm.time_gayberne.stop(); } } diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h index 6ac390437a..a341940c0a 100644 --- a/lib/gpu/gb_gpu_extra.h +++ b/lib/gpu/gb_gpu_extra.h @@ -18,7 +18,6 @@ #ifndef GB_GPU_EXTRA_H #define GB_GPU_EXTRA_H -#define MAX_SHARED_TYPES 8 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef _DOUBLE_DOUBLE @@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" #else @@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #define BLOCK_SIZE_X get_local_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define __inline inline +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu index b8d06ec6da..d1abd4338b 100644 --- a/lib/gpu/gb_gpu_kernel.cu +++ b/lib/gpu/gb_gpu_kernel.cu @@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q, __global acctyp4 *ans, const int astride, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int inum, - const int nall) { + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); - __syncthreads(); - - if (ii0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-r; + r12[1]*=-r; + r12[2]*=-r; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + + // Torque on 1 + temp1 = -u_r*eta*factor_lj; + temp2 = -u_r*chi*factor_lj; + numtyp temp3 = -chi*eta*factor_lj; + tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; + tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; + tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; + + } // for nbor + } // if ii - // energy - - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - int mtype=mul24(ntypes,itype)+jtype; - numtyp sigma = sig_eps[mtype].x; - numtyp epsilon = sig_eps[mtype].y; - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12); - uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); - dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); - dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); - } - - // torque for particle 1 - { - numtyp tempv[3], tempv2[3]; - tempv[0] = -uslj_rsq*kappa[0]; - tempv[1] = -uslj_rsq*kappa[1]; - tempv[2] = -uslj_rsq*kappa[2]; - gpu_row_times3(kappa,g1,tempv2); - gpu_cross3(tempv,tempv2,tUr); - } + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; } } - - // Compute eta - { - eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - // Compute teta - numtyp temp[9], tempv[3], tempv2[3]; - compute_eta_torque(g12,a1,ishape,temp); - numtyp temp1 = -eta*gum[1]; + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; - tempv[0] = temp1*temp[0]; - tempv[1] = temp1*temp[1]; - tempv[2] = temp1*temp[2]; - gpu_cross3(a1,tempv,tempv2); - teta[0] = tempv2[0]; - teta[1] = tempv2[1]; - teta[2] = tempv2[2]; - - tempv[0] = temp1*temp[3]; - tempv[1] = temp1*temp[4]; - tempv[2] = temp1*temp[5]; - gpu_cross3(a1+3,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; - tempv[0] = temp1*temp[6]; - tempv[1] = temp1*temp[7]; - tempv[2] = temp1*temp[8]; - gpu_cross3(a1+6,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - } - - numtyp chi, dchi[3], tchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b2[9], b12[9]; - { - gpu_times3(well[jtype],a2,b12); - gpu_transpose_times3(a2,b12,b2); - gpu_plus3(b1,b2,b12); + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; } - - // compute chi_12 - r12[0]*=r; - r12[1]*=r; - r12[2]*=r; - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - chi = gpu_dot3(r12,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/ - gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12[0]); - dchi[1] = temp2*(iota[1]-temp1*r12[1]); - dchi[2] = temp2*(iota[2]-temp1*r12[2]); - - // compute t_chi - numtyp tempv[3]; - gpu_row_times3(iota,b1,tempv); - gpu_cross3(tempv,iota,tchi); - temp1 = (numtyp)-4.0*ir*ir; - tchi[0] *= temp1; - tchi[1] *= temp1; - tchi[2] *= temp1; } - numtyp temp2 = factor_lj*eta*chi; - if (eflag>0) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { - r12[0]*=-r; - r12[1]*=-r; - r12[2]*=-r; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - f.x+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - f.y+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - f.z+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - f.x+=temp1*dchi[0]-temp2*dUr[0]; - f.y+=temp1*dchi[1]-temp2*dUr[1]; - f.z+=temp1*dchi[2]-temp2*dUr[2]; - } - - // Torque on 1 - temp1 = -u_r*eta*factor_lj; - temp2 = -u_r*chi*factor_lj; - numtyp temp3 = -chi*eta*factor_lj; - tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; - tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; - tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - - } // for nbor - // Store answers - __global acctyp *ap1=engv+ii; - if (eflag>0) { - *ap1=energy; - ap1+=astride; - } - if (vflag>0) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; + if (ii0) { + *ap1=energy; ap1+=astride; } - } - ans[ii]=f; - ans[ii+astride]=tor; + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=astride; + } + } + ans[ii]=f; + ans[ii+astride]=tor; } // if ii } diff --git a/lib/gpu/gb_gpu_kernel_lj.cu b/lib/gpu/gb_gpu_kernel_lj.cu index 3e42cbcbbc..120cc7e1f6 100644 --- a/lib/gpu/gb_gpu_kernel_lj.cu +++ b/lib/gpu/gb_gpu_kernel_lj.cu @@ -34,33 +34,36 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q, __global acctyp4 *ans, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag,const int start, const int inum, - const int nall) { - __local numtyp sp_lj[4]; + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; - __syncthreads(); + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -265,39 +307,42 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, __global acctyp4 *ans, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int start, const int inum, - const int nall) { - __local numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; - __syncthreads(); + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1+=energy; @@ -366,45 +450,49 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global acctyp *engv, __global int *err_flag, const int eflag,const int vflag, const int start, const int inum, const int nall) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1+=energy; diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp index c745a08da3..971649c6e8 100644 --- a/lib/gpu/gb_gpu_memory.cpp +++ b/lib/gpu/gb_gpu_memory.cpp @@ -73,6 +73,7 @@ int GB_GPU_MemoryT::init(const int ntypes, const double gamma, if (host_nlocal>0) _gpu_host=1; + _threads_per_atom=device->threads_per_atom(); int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0, _gpu_host,max_nbors,cell_size,true); if (success!=0) diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index 8a6653170f..010d3b3499 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -198,6 +198,7 @@ class GB_GPU_Memory { UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj; inline int block_size() { return _block_size; } + int _threads_per_atom; private: bool _allocated, _compiled; int _block_size; diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu index ac7609ee6c..39a17d89ed 100644 --- a/lib/gpu/lj96_cut_gpu_kernel.cu +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -70,6 +70,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -81,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -175,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp index 3ef61d6861..0b066c0973 100644 --- a/lib/gpu/lj96_cut_gpu_memory.cpp +++ b/lib/gpu/lj96_cut_gpu_memory.cpp @@ -126,7 +126,8 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -137,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu index b2f8e85a1f..1eadc7055d 100644 --- a/lib/gpu/lj_cut_gpu_kernel.cu +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -70,6 +70,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -81,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -174,49 +229,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp index 0d8fd3bd05..a294eb647f 100644 --- a/lib/gpu/lj_cut_gpu_memory.cpp +++ b/lib/gpu/lj_cut_gpu_memory.cpp @@ -126,7 +126,8 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -137,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lj_expand_gpu_kernel.cu b/lib/gpu/lj_expand_gpu_kernel.cu index dc835c6b8d..df3f0ff9ae 100644 --- a/lib/gpu/lj_expand_gpu_kernel.cu +++ b/lib/gpu/lj_expand_gpu_kernel.cu @@ -70,6 +70,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -81,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -177,49 +232,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(numtyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj_expand_gpu_memory.cpp b/lib/gpu/lj_expand_gpu_memory.cpp index e29f9f8dc7..fe5bf0b513 100644 --- a/lib/gpu/lj_expand_gpu_memory.cpp +++ b/lib/gpu/lj_expand_gpu_memory.cpp @@ -126,7 +126,8 @@ void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -137,16 +138,18 @@ void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu index 1e430d3967..2b2cccb284 100644 --- a/lib/gpu/ljc_cut_gpu_kernel.cu +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -80,6 +80,7 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -91,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , __global numtyp *cutsq, - const numtyp qqrd2e) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -108,29 +113,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -208,54 +266,69 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , __global numtyp *_cutsq, - const numtyp qqrd2e) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp index bb9a98830e..642ff6ecc7 100644 --- a/lib/gpu/ljc_cut_gpu_memory.cpp +++ b/lib/gpu/ljc_cut_gpu_memory.cpp @@ -138,7 +138,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -149,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), - &_qqrd2e); + &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &cutsq.begin(), &_qqrd2e); + &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index 646cea902e..000ecac616 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -88,6 +88,7 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -99,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -116,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -224,52 +282,68 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp index 57c7aab626..f37e6b1857 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.cpp +++ b/lib/gpu/ljcl_cut_gpu_memory.cpp @@ -136,7 +136,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -147,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald); + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald); + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/morse_gpu_kernel.cu b/lib/gpu/morse_gpu_kernel.cu index 51678b648e..3c483efac2 100644 --- a/lib/gpu/morse_gpu_kernel.cu +++ b/lib/gpu/morse_gpu_kernel.cu @@ -70,6 +70,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #endif @@ -81,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1, __global numtyp2* mor2, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -175,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in, __global numtyp2* mor2_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - mor2[ii]=mor2_in[ii]; + mor2[tid]=mor2_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/morse_gpu_memory.cpp b/lib/gpu/morse_gpu_memory.cpp index 3973e2a27d..f146b39215 100644 --- a/lib/gpu/morse_gpu_memory.cpp +++ b/lib/gpu/morse_gpu_memory.cpp @@ -125,7 +125,8 @@ void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int anall=this->atom->nall(); @@ -136,16 +137,18 @@ void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(), &_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->ans->dev_ans.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); }