diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 6f0c7c8433..1b2900f97f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -515,8 +515,8 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, const numtyp4 pol3j = polar3[j]; numtyp qkyz = pol3j.x; // rpole[j][9]; numtyp qkzz = pol3j.y; // rpole[j][12]; - int jtype = pol3j.z; // amtype[j]; - int jgroup = pol3j.w; // amgroup[j]; + //int jtype = pol3j.z; // amtype[j]; + //int jgroup = pol3j.w; // amgroup[j]; const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; @@ -546,18 +546,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp dirx = diy*zr - diz*yr; numtyp diry = diz*xr - dix*zr; numtyp dirz = dix*yr - diy*xr; - numtyp dkrx = dky*zr - dkz*yr; - numtyp dkry = dkz*xr - dkx*zr; - numtyp dkrz = dkx*yr - dky*xr; numtyp dikx = diy*dkz - diz*dky; numtyp diky = diz*dkx - dix*dkz; numtyp dikz = dix*dky - diy*dkx; numtyp qirx = qiz*yr - qiy*zr; numtyp qiry = qix*zr - qiz*xr; numtyp qirz = qiy*xr - qix*yr; - numtyp qkrx = qkz*yr - qky*zr; - numtyp qkry = qkx*zr - qkz*xr; - numtyp qkrz = qky*xr - qkx*yr; numtyp qikx = qky*qiz - qkz*qiy; numtyp qiky = qkz*qix - qkx*qiz; numtyp qikz = qkx*qiy - qky*qix; @@ -570,18 +564,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp qikrx = qizk*yr - qiyk*zr; numtyp qikry = qixk*zr - qizk*xr; numtyp qikrz = qiyk*xr - qixk*yr; - numtyp qkirx = qkzi*yr - qkyi*zr; - numtyp qkiry = qkxi*zr - qkzi*xr; - numtyp qkirz = qkyi*xr - qkxi*yr; numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; - numtyp diqkrx = diqkz*yr - diqky*zr; - numtyp diqkry = diqkx*zr - diqkz*xr; - numtyp diqkrz = diqky*xr - diqkx*yr; numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; @@ -735,7 +723,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_charge(); + //local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; @@ -751,8 +739,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -762,21 +748,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - //numtyp bn[4],bcn[3]; - //numtyp fid[3],fip[3]; - - const numtyp4 pol1i = polar1[i]; - numtyp dix = pol1i.y; // rpole[i][1]; - numtyp diy = pol1i.z; // rpole[i][2]; - numtyp diz = pol1i.w; // rpole[i][3]; - const numtyp4 pol2i = polar2[i]; - numtyp qixx = pol2i.x; // rpole[i][4]; - numtyp qixy = pol2i.y; // rpole[i][5]; - numtyp qixz = pol2i.z; // rpole[i][6]; - numtyp qiyy = pol2i.w; // rpole[i][8]; const numtyp4 pol3i = polar3[i]; - numtyp qiyz = pol3i.x; // rpole[i][9]; - numtyp qizz = pol3i.y; // rpole[i][12]; int itype = pol3i.z; // amtype[i]; int igroup = pol3i.w; // amgroup[i]; @@ -843,11 +815,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, // intermediates involving moments and separation distance - numtyp dir = dix*xr + diy*yr + diz*zr; - numtyp qix = qixx*xr + qixy*yr + qixz*zr; - numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; - numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; - numtyp qir = qix*xr + qiy*yr + qiz*zr; numtyp dkr = dkx*xr + dky*yr + dkz*zr; numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; @@ -959,7 +926,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_charge(); + //local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; @@ -977,8 +944,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -989,9 +954,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } int itype,igroup; - //numtyp bn[4],bcn[3]; - //numtyp fid[3],fip[3]; - itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; @@ -1008,7 +970,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, int j = jextra & NEIGHMASK15; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - //int jtype=jx.w; // Compute r12 numtyp xr = jx.x - ix.x; @@ -1171,23 +1132,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, if (ii +void BaseAmoebaT::setup_fft(const int size, const int element_type) +{ + +} + +// --------------------------------------------------------------------------- +// Compute FFT on the device // --------------------------------------------------------------------------- template diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index cf767be96e..2bff362f29 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -190,7 +190,12 @@ class BaseAmoeba { _fieldp.update_host(_max_fieldp_size*8,false); } + /// setup a plan for FFT, where size is the number of elements + + void setup_fft(const int size, const int element_type=0); + /// compute forward/backward FFT on the device + void compute_fft1d(void** in, void** out, const int mode); // -------------------------- DEVICE DATA ------------------------- diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index ad52df3d4b..f514a50620 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -21,10 +21,12 @@ using namespace LAMMPS_NS; #define SCALE 0 +enum {FORWARD,BACKWARD}; + // External functions from GPU library -int amoeba_setup_fft(const int size); -int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode); +int amoeba_setup_fft(const int size, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int mode); /* ---------------------------------------------------------------------- partition an FFT grid across processors @@ -64,6 +66,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm"); debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm"); #endif + // copy owned 4d brick grid values to FFT grid n = 0; @@ -88,6 +91,8 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() fft1->compute(cfft,cfft,FFT3d::FORWARD); + //amoeba_compute_fft1d(cfft,cfft,FORWARD); + if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 734ca53bba..29db1b4c1b 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -38,6 +38,7 @@ using namespace LAMMPS_NS; using namespace MathConst; +// same as in amoeba_induce.cpp enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; @@ -46,6 +47,7 @@ enum{GEAR,ASPC,LSQR}; enum{BUILD,APPLY}; enum{GORDON1,GORDON2}; +// same as in pair_amoeba.cpp enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; #define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye @@ -188,7 +190,6 @@ void PairAmoebaGPU::init_style() ic_kspace = new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); } - } }