Cleaned up unused variables in the amoeba kernels, made room for convolution gpu

This commit is contained in:
Trung Nguyen
2022-08-16 15:37:49 -05:00
parent 46b8b00a4f
commit 28dabb9687
6 changed files with 34 additions and 67 deletions

View File

@ -515,8 +515,8 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
const numtyp4 pol3j = polar3[j];
numtyp qkyz = pol3j.x; // rpole[j][9];
numtyp qkzz = pol3j.y; // rpole[j][12];
int jtype = pol3j.z; // amtype[j];
int jgroup = pol3j.w; // amgroup[j];
//int jtype = pol3j.z; // amtype[j];
//int jgroup = pol3j.w; // amgroup[j];
const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)];
numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];
@ -546,18 +546,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
numtyp dirx = diy*zr - diz*yr;
numtyp diry = diz*xr - dix*zr;
numtyp dirz = dix*yr - diy*xr;
numtyp dkrx = dky*zr - dkz*yr;
numtyp dkry = dkz*xr - dkx*zr;
numtyp dkrz = dkx*yr - dky*xr;
numtyp dikx = diy*dkz - diz*dky;
numtyp diky = diz*dkx - dix*dkz;
numtyp dikz = dix*dky - diy*dkx;
numtyp qirx = qiz*yr - qiy*zr;
numtyp qiry = qix*zr - qiz*xr;
numtyp qirz = qiy*xr - qix*yr;
numtyp qkrx = qkz*yr - qky*zr;
numtyp qkry = qkx*zr - qkz*xr;
numtyp qkrz = qky*xr - qkx*yr;
numtyp qikx = qky*qiz - qkz*qiy;
numtyp qiky = qkz*qix - qkx*qiz;
numtyp qikz = qkx*qiy - qky*qix;
@ -570,18 +564,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
numtyp qikrx = qizk*yr - qiyk*zr;
numtyp qikry = qixk*zr - qizk*xr;
numtyp qikrz = qiyk*xr - qixk*yr;
numtyp qkirx = qkzi*yr - qkyi*zr;
numtyp qkiry = qkxi*zr - qkzi*xr;
numtyp qkirz = qkyi*xr - qkxi*yr;
numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
numtyp diqkrx = diqkz*yr - diqky*zr;
numtyp diqkry = diqkx*zr - diqkz*xr;
numtyp diqkrz = diqky*xr - diqkx*yr;
numtyp dkqirx = dkqiz*yr - dkqiy*zr;
numtyp dkqiry = dkqix*zr - dkqiz*xr;
numtyp dkqirz = dkqiy*xr - dkqix*yr;
@ -735,7 +723,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);
int n_stride;
local_allocate_store_charge();
//local_allocate_store_charge();
acctyp _fieldp[6];
for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
@ -751,8 +739,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w;
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
@ -762,21 +748,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
nbor_mem = dev_short_nbor;
}
//numtyp bn[4],bcn[3];
//numtyp fid[3],fip[3];
const numtyp4 pol1i = polar1[i];
numtyp dix = pol1i.y; // rpole[i][1];
numtyp diy = pol1i.z; // rpole[i][2];
numtyp diz = pol1i.w; // rpole[i][3];
const numtyp4 pol2i = polar2[i];
numtyp qixx = pol2i.x; // rpole[i][4];
numtyp qixy = pol2i.y; // rpole[i][5];
numtyp qixz = pol2i.z; // rpole[i][6];
numtyp qiyy = pol2i.w; // rpole[i][8];
const numtyp4 pol3i = polar3[i];
numtyp qiyz = pol3i.x; // rpole[i][9];
numtyp qizz = pol3i.y; // rpole[i][12];
int itype = pol3i.z; // amtype[i];
int igroup = pol3i.w; // amgroup[i];
@ -843,11 +815,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
// intermediates involving moments and separation distance
numtyp dir = dix*xr + diy*yr + diz*zr;
numtyp qix = qixx*xr + qixy*yr + qixz*zr;
numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
numtyp qir = qix*xr + qiy*yr + qiz*zr;
numtyp dkr = dkx*xr + dky*yr + dkz*zr;
numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
@ -959,7 +926,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);
int n_stride;
local_allocate_store_charge();
//local_allocate_store_charge();
acctyp _fieldp[6];
for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
@ -977,8 +944,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w;
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
@ -989,9 +954,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
}
int itype,igroup;
//numtyp bn[4],bcn[3];
//numtyp fid[3],fip[3];
itype = polar3[i].z; // amtype[i];
igroup = polar3[i].w; // amgroup[i];
@ -1008,7 +970,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
int j = jextra & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int jtype=jx.w;
// Compute r12
numtyp xr = jx.x - ix.x;
@ -1171,23 +1132,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
if (ii<inum) {
int itype,igroup;
/*
numtyp bfac;
numtyp psc3,psc5,psc7;
numtyp dsc3,dsc5,dsc7;
numtyp usc3,usc5;
numtyp psr3,psr5,psr7;
numtyp dsr3,dsr5,dsr7;
numtyp usr5;
numtyp term1,term2,term3;
numtyp term4,term5;
numtyp term6,term7;
numtyp rc3[3],rc5[3],rc7[3];
numtyp prc3[3],prc5[3],prc7[3];
numtyp drc3[3],drc5[3],drc7[3];
numtyp urc3[3],urc5[3];
numtyp bn[5];
*/
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
int numj, nbor, nbor_end;
@ -1196,8 +1140,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w;
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
@ -1303,7 +1245,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
numtyp qkr = qkx*xr + qky*yr + qkz*zr;
numtyp uir = uix*xr + uiy*yr + uiz*zr;
numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
//numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
numtyp ukr = ukx*xr + uky*yr + ukz*zr;
numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;

View File

@ -162,6 +162,10 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
// Free-function entry point for FFT setup: forwards size and element_type,
// unchanged, to AMOEBAMF.setup_fft(). Returns nothing.
// NOTE(review): the accepted values of element_type are not visible from
// this wrapper -- confirm against the setup_fft implementation.
void amoeba_setup_fft(const int size, const int element_type) {
AMOEBAMF.setup_fft(size, element_type);
}
// Free-function entry point for the 1d FFT: forwards the in/out buffer
// handles and the mode flag, unchanged, to AMOEBAMF.compute_fft1d().
// Returns nothing.
// NOTE(review): mode presumably selects forward vs. backward transform, and
// in/out are passed as void** -- confirm what each level of indirection
// refers to against the compute_fft1d implementation.
void amoeba_compute_fft1d(void** in, void** out, const int mode) {
AMOEBAMF.compute_fft1d(in, out, mode);
}

View File

@ -579,7 +579,17 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
}
// ---------------------------------------------------------------------------
// Compute FFT
// Setup the FFT plan
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::setup_fft(const int size, const int element_type)
{
// Placeholder: the body is currently empty, so neither size nor
// element_type is used and no FFT plan is created by this call.
}
// ---------------------------------------------------------------------------
// Compute FFT on the device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>

View File

@ -190,7 +190,12 @@ class BaseAmoeba {
_fieldp.update_host(_max_fieldp_size*8,false);
}
/// setup a plan for FFT, where size is the number of elements
void setup_fft(const int size, const int element_type=0);
/// compute forward/backward FFT on the device
void compute_fft1d(void** in, void** out, const int mode);
// -------------------------- DEVICE DATA -------------------------

View File

@ -21,10 +21,12 @@ using namespace LAMMPS_NS;
#define SCALE 0
enum {FORWARD,BACKWARD};
// External functions from GPU library
int amoeba_setup_fft(const int size);
int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode);
// These prototypes must agree exactly (return type and parameter types) with
// the definitions exported by the GPU library: both entry points return
// nothing, and the 1d FFT takes its input/output buffers as void**.
// Declaring them as returning int, or taking void*, would either read a
// return value that is never produced or refer to an overload that has no
// definition in the library.
void amoeba_setup_fft(const int size, const int element_type);
void amoeba_compute_fft1d(void** in, void** out, const int mode);
/* ----------------------------------------------------------------------
partition an FFT grid across processors
@ -64,6 +66,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm");
debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm");
#endif
// copy owned 4d brick grid values to FFT grid
n = 0;
@ -88,6 +91,8 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
fft1->compute(cfft,cfft,FFT3d::FORWARD);
//amoeba_compute_fft1d(cfft,cfft,FORWARD);
if (SCALE) {
double scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;

View File

@ -38,6 +38,7 @@
using namespace LAMMPS_NS;
using namespace MathConst;
// same as in amoeba_induce.cpp
enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm
enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm
enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG};
@ -46,6 +47,7 @@ enum{GEAR,ASPC,LSQR};
enum{BUILD,APPLY};
enum{GORDON1,GORDON2};
// same as in pair_amoeba.cpp
enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};
#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye
@ -188,7 +190,6 @@ void PairAmoebaGPU::init_style()
ic_kspace =
new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC);
}
}
}