Cleaned up unused variables in the amoeba kernels and made room for the GPU convolution code
This commit is contained in:
@ -515,8 +515,8 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
||||
const numtyp4 pol3j = polar3[j];
|
||||
numtyp qkyz = pol3j.x; // rpole[j][9];
|
||||
numtyp qkzz = pol3j.y; // rpole[j][12];
|
||||
int jtype = pol3j.z; // amtype[j];
|
||||
int jgroup = pol3j.w; // amgroup[j];
|
||||
//int jtype = pol3j.z; // amtype[j];
|
||||
//int jgroup = pol3j.w; // amgroup[j];
|
||||
|
||||
const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)];
|
||||
numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];
|
||||
@ -546,18 +546,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
||||
numtyp dirx = diy*zr - diz*yr;
|
||||
numtyp diry = diz*xr - dix*zr;
|
||||
numtyp dirz = dix*yr - diy*xr;
|
||||
numtyp dkrx = dky*zr - dkz*yr;
|
||||
numtyp dkry = dkz*xr - dkx*zr;
|
||||
numtyp dkrz = dkx*yr - dky*xr;
|
||||
numtyp dikx = diy*dkz - diz*dky;
|
||||
numtyp diky = diz*dkx - dix*dkz;
|
||||
numtyp dikz = dix*dky - diy*dkx;
|
||||
numtyp qirx = qiz*yr - qiy*zr;
|
||||
numtyp qiry = qix*zr - qiz*xr;
|
||||
numtyp qirz = qiy*xr - qix*yr;
|
||||
numtyp qkrx = qkz*yr - qky*zr;
|
||||
numtyp qkry = qkx*zr - qkz*xr;
|
||||
numtyp qkrz = qky*xr - qkx*yr;
|
||||
numtyp qikx = qky*qiz - qkz*qiy;
|
||||
numtyp qiky = qkz*qix - qkx*qiz;
|
||||
numtyp qikz = qkx*qiy - qky*qix;
|
||||
@ -570,18 +564,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
||||
numtyp qikrx = qizk*yr - qiyk*zr;
|
||||
numtyp qikry = qixk*zr - qizk*xr;
|
||||
numtyp qikrz = qiyk*xr - qixk*yr;
|
||||
numtyp qkirx = qkzi*yr - qkyi*zr;
|
||||
numtyp qkiry = qkxi*zr - qkzi*xr;
|
||||
numtyp qkirz = qkyi*xr - qkxi*yr;
|
||||
numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
|
||||
numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
|
||||
numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
|
||||
numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
|
||||
numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
|
||||
numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
|
||||
numtyp diqkrx = diqkz*yr - diqky*zr;
|
||||
numtyp diqkry = diqkx*zr - diqkz*xr;
|
||||
numtyp diqkrz = diqky*xr - diqkx*yr;
|
||||
numtyp dkqirx = dkqiz*yr - dkqiy*zr;
|
||||
numtyp dkqiry = dkqix*zr - dkqiz*xr;
|
||||
numtyp dkqirz = dkqiy*xr - dkqix*yr;
|
||||
@ -735,7 +723,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
//local_allocate_store_charge();
|
||||
|
||||
acctyp _fieldp[6];
|
||||
for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
|
||||
@ -751,8 +739,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
//numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
//int itype=ix.w;
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
@ -762,21 +748,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
//numtyp bn[4],bcn[3];
|
||||
//numtyp fid[3],fip[3];
|
||||
|
||||
const numtyp4 pol1i = polar1[i];
|
||||
numtyp dix = pol1i.y; // rpole[i][1];
|
||||
numtyp diy = pol1i.z; // rpole[i][2];
|
||||
numtyp diz = pol1i.w; // rpole[i][3];
|
||||
const numtyp4 pol2i = polar2[i];
|
||||
numtyp qixx = pol2i.x; // rpole[i][4];
|
||||
numtyp qixy = pol2i.y; // rpole[i][5];
|
||||
numtyp qixz = pol2i.z; // rpole[i][6];
|
||||
numtyp qiyy = pol2i.w; // rpole[i][8];
|
||||
const numtyp4 pol3i = polar3[i];
|
||||
numtyp qiyz = pol3i.x; // rpole[i][9];
|
||||
numtyp qizz = pol3i.y; // rpole[i][12];
|
||||
int itype = pol3i.z; // amtype[i];
|
||||
int igroup = pol3i.w; // amgroup[i];
|
||||
|
||||
@ -843,11 +815,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
|
||||
// intermediates involving moments and separation distance
|
||||
|
||||
numtyp dir = dix*xr + diy*yr + diz*zr;
|
||||
numtyp qix = qixx*xr + qixy*yr + qixz*zr;
|
||||
numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
|
||||
numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
|
||||
numtyp qir = qix*xr + qiy*yr + qiz*zr;
|
||||
numtyp dkr = dkx*xr + dky*yr + dkz*zr;
|
||||
numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
|
||||
numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
|
||||
@ -959,7 +926,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
int n_stride;
|
||||
local_allocate_store_charge();
|
||||
//local_allocate_store_charge();
|
||||
|
||||
acctyp _fieldp[6];
|
||||
for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
|
||||
@ -977,8 +944,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
//numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
//int itype=ix.w;
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
@ -989,9 +954,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
int itype,igroup;
|
||||
//numtyp bn[4],bcn[3];
|
||||
//numtyp fid[3],fip[3];
|
||||
|
||||
itype = polar3[i].z; // amtype[i];
|
||||
igroup = polar3[i].w; // amgroup[i];
|
||||
|
||||
@ -1008,7 +970,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
|
||||
int j = jextra & NEIGHMASK15;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
//int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp xr = jx.x - ix.x;
|
||||
@ -1171,23 +1132,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int itype,igroup;
|
||||
/*
|
||||
numtyp bfac;
|
||||
numtyp psc3,psc5,psc7;
|
||||
numtyp dsc3,dsc5,dsc7;
|
||||
numtyp usc3,usc5;
|
||||
numtyp psr3,psr5,psr7;
|
||||
numtyp dsr3,dsr5,dsr7;
|
||||
numtyp usr5;
|
||||
numtyp term1,term2,term3;
|
||||
numtyp term4,term5;
|
||||
numtyp term6,term7;
|
||||
numtyp rc3[3],rc5[3],rc7[3];
|
||||
numtyp prc3[3],prc5[3],prc7[3];
|
||||
numtyp drc3[3],drc5[3],drc7[3];
|
||||
numtyp urc3[3],urc5[3];
|
||||
numtyp bn[5];
|
||||
*/
|
||||
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
|
||||
|
||||
int numj, nbor, nbor_end;
|
||||
@ -1196,8 +1140,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
//numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
//int itype=ix.w;
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
@ -1303,7 +1245,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
|
||||
numtyp qkr = qkx*xr + qky*yr + qkz*zr;
|
||||
numtyp uir = uix*xr + uiy*yr + uiz*zr;
|
||||
numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
|
||||
//numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
|
||||
numtyp ukr = ukx*xr + uky*yr + ukz*zr;
|
||||
numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;
|
||||
|
||||
|
||||
@ -162,6 +162,10 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
|
||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||
}
|
||||
|
||||
// Library entry point: forwards an FFT plan-setup request to the AMOEBAMF
// object by calling AMOEBAMF.setup_fft(size, element_type).
//   size         - passed through unchanged
//   element_type - passed through unchanged
// Returns nothing.
// NOTE(review): the caller-side prototype visible elsewhere in this change
// declares `int amoeba_setup_fft(const int size, const int element_type);`
// while this definition returns void - confirm and reconcile the return type.
void amoeba_setup_fft(const int size, const int element_type) {
|
||||
AMOEBAMF.setup_fft(size, element_type);
|
||||
}
|
||||
|
||||
// Library entry point: forwards a 1d FFT request to the AMOEBAMF object by
// calling AMOEBAMF.compute_fft1d(in, out, mode).
//   in, out - passed through unchanged as void**
//   mode    - passed through unchanged (presumably FORWARD/BACKWARD - confirm
//             against the caller's enum)
// Returns nothing.
// NOTE(review): the caller-side prototype visible elsewhere in this change is
// `int amoeba_compute_fft1d(void* in, void* out, const int mode);` - it
// differs from this definition in both pointer level (void* vs void**) and
// return type (int vs void). The two must agree before the call is enabled.
void amoeba_compute_fft1d(void** in, void** out, const int mode) {
|
||||
AMOEBAMF.compute_fft1d(in, out, mode);
|
||||
}
|
||||
|
||||
@ -579,7 +579,17 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute FFT
|
||||
// Setup the FFT plan
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Set up an FFT plan.
// The body is currently empty: neither `size` nor `element_type` is used and
// no plan is created, so calling this function has no effect.
// NOTE(review): looks like a placeholder for the GPU convolution work -
// confirm before relying on it.
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::setup_fft(const int size, const int element_type)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute FFT on the device
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
||||
@ -190,7 +190,12 @@ class BaseAmoeba {
|
||||
_fieldp.update_host(_max_fieldp_size*8,false);
|
||||
}
|
||||
|
||||
/// setup a plan for FFT, where size is the number of elements
|
||||
|
||||
void setup_fft(const int size, const int element_type=0);
|
||||
|
||||
/// compute forward/backward FFT on the device
|
||||
|
||||
void compute_fft1d(void** in, void** out, const int mode);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
@ -21,10 +21,12 @@ using namespace LAMMPS_NS;
|
||||
|
||||
#define SCALE 0
|
||||
|
||||
enum {FORWARD,BACKWARD};
|
||||
|
||||
// External functions from GPU library
// NOTE(review): the library-side definitions visible in this change are
//   void amoeba_setup_fft(const int size, const int element_type)
//   void amoeba_compute_fft1d(void** in, void** out, const int mode)
// i.e. they return void and take void** buffers, whereas the prototypes
// below return int and take single-level pointers. Reconcile the
// declarations with the definitions before calling these functions.
|
||||
|
||||
int amoeba_setup_fft(const int size);
|
||||
int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode);
|
||||
int amoeba_setup_fft(const int size, const int element_type);
|
||||
int amoeba_compute_fft1d(void* in, void* out, const int mode);
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
partition an FFT grid across processors
|
||||
@ -64,6 +66,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
|
||||
debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm");
|
||||
debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm");
|
||||
#endif
|
||||
|
||||
// copy owned 4d brick grid values to FFT grid
|
||||
|
||||
n = 0;
|
||||
@ -88,6 +91,8 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
|
||||
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
|
||||
//amoeba_compute_fft1d(cfft,cfft,FORWARD);
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
|
||||
@ -38,6 +38,7 @@
|
||||
using namespace LAMMPS_NS;
|
||||
using namespace MathConst;
|
||||
|
||||
// same as in amoeba_induce.cpp
|
||||
enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm
|
||||
enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm
|
||||
enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG};
|
||||
@ -46,6 +47,7 @@ enum{GEAR,ASPC,LSQR};
|
||||
enum{BUILD,APPLY};
|
||||
enum{GORDON1,GORDON2};
|
||||
|
||||
// same as in pair_amoeba.cpp
|
||||
enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};
|
||||
|
||||
#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye
|
||||
@ -188,7 +190,6 @@ void PairAmoebaGPU::init_style()
|
||||
ic_kspace =
|
||||
new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user