Cleaned up unused variables in the amoeba kernels, made room for convolution gpu

2022-08-16 15:37:49 -05:00
parent 46b8b00a4f
commit 28dabb9687
6 changed files with 34 additions and 67 deletions
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -515,8 +515,8 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      const numtyp4 pol3j = polar3[j];
      numtyp qkyz = pol3j.x; // rpole[j][9];
      numtyp qkzz = pol3j.y; // rpole[j][12];
-      int jtype = pol3j.z; // amtype[j];
-      int jgroup =  pol3j.w; // amgroup[j];
+      //int jtype = pol3j.z; // amtype[j];
+      //int jgroup =  pol3j.w; // amgroup[j];

      const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)];
      numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];
@ -546,18 +546,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp dirx = diy*zr - diz*yr;
      numtyp diry = diz*xr - dix*zr;
      numtyp dirz = dix*yr - diy*xr;
-      numtyp dkrx = dky*zr - dkz*yr;
-      numtyp dkry = dkz*xr - dkx*zr;
-      numtyp dkrz = dkx*yr - dky*xr;
      numtyp dikx = diy*dkz - diz*dky;
      numtyp diky = diz*dkx - dix*dkz;
      numtyp dikz = dix*dky - diy*dkx;
      numtyp qirx = qiz*yr - qiy*zr;
      numtyp qiry = qix*zr - qiz*xr;
      numtyp qirz = qiy*xr - qix*yr;
-      numtyp qkrx = qkz*yr - qky*zr;
-      numtyp qkry = qkx*zr - qkz*xr;
-      numtyp qkrz = qky*xr - qkx*yr;
      numtyp qikx = qky*qiz - qkz*qiy;
      numtyp qiky = qkz*qix - qkx*qiz;
      numtyp qikz = qkx*qiy - qky*qix;
@ -570,18 +564,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp qikrx = qizk*yr - qiyk*zr;
      numtyp qikry = qixk*zr - qizk*xr;
      numtyp qikrz = qiyk*xr - qixk*yr;
-      numtyp qkirx = qkzi*yr - qkyi*zr;
-      numtyp qkiry = qkxi*zr - qkzi*xr;
-      numtyp qkirz = qkyi*xr - qkxi*yr;
      numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
      numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
      numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
      numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
      numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
      numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
-      numtyp diqkrx = diqkz*yr - diqky*zr;
-      numtyp diqkry = diqkx*zr - diqkz*xr;
-      numtyp diqkrz = diqky*xr - diqkx*yr;
      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
      numtyp dkqiry = dkqix*zr - dkqiz*xr;
      numtyp dkqirz = dkqiy*xr - dkqix*yr;
@ -735,7 +723,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
-  local_allocate_store_charge();
+  //local_allocate_store_charge();

  acctyp _fieldp[6];
  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
@ -751,8 +739,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //numtyp qtmp; fetch(qtmp,i,q_tex);
-    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
@ -762,21 +748,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
      nbor_mem = dev_short_nbor;
    }

-    //numtyp bn[4],bcn[3];
-    //numtyp fid[3],fip[3];
-
-    const numtyp4 pol1i = polar1[i];
-    numtyp dix = pol1i.y;    // rpole[i][1];
-    numtyp diy = pol1i.z;    // rpole[i][2];
-    numtyp diz = pol1i.w;    // rpole[i][3];
-    const numtyp4 pol2i = polar2[i];
-    numtyp qixx = pol2i.x;   // rpole[i][4];
-    numtyp qixy = pol2i.y;   // rpole[i][5];
-    numtyp qixz = pol2i.z;   // rpole[i][6];
-    numtyp qiyy = pol2i.w;   // rpole[i][8];
    const numtyp4 pol3i = polar3[i];
-    numtyp qiyz = pol3i.x;   // rpole[i][9];
-    numtyp qizz = pol3i.y;   // rpole[i][12];
    int itype  = pol3i.z;    // amtype[i];
    int igroup = pol3i.w;    // amgroup[i];

@ -843,11 +815,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,

      // intermediates involving moments and separation distance

-      numtyp dir = dix*xr + diy*yr + diz*zr;
-      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
-      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
-      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
-      numtyp qir = qix*xr + qiy*yr + qiz*zr;
      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
@ -959,7 +926,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
-  local_allocate_store_charge();
+  //local_allocate_store_charge();

  acctyp _fieldp[6];
  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
@ -977,8 +944,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //numtyp qtmp; fetch(qtmp,i,q_tex);
-    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
@ -989,9 +954,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
    }

    int itype,igroup;
-    //numtyp bn[4],bcn[3];
-    //numtyp fid[3],fip[3];
-
    itype  = polar3[i].z; // amtype[i];
    igroup = polar3[i].w; // amgroup[i];

@ -1008,7 +970,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int jtype=jx.w;

      // Compute r12
      numtyp xr = jx.x - ix.x;
@ -1171,23 +1132,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int itype,igroup;
-    /*
-    numtyp bfac;
-    numtyp psc3,psc5,psc7;
-    numtyp dsc3,dsc5,dsc7;
-    numtyp usc3,usc5;
-    numtyp psr3,psr5,psr7;
-    numtyp dsr3,dsr5,dsr7;
-    numtyp usr5;
-    numtyp term1,term2,term3;
-    numtyp term4,term5;
-    numtyp term6,term7;
-    numtyp rc3[3],rc5[3],rc7[3];
-    numtyp prc3[3],prc5[3],prc7[3];
-    numtyp drc3[3],drc5[3],drc7[3];
-    numtyp urc3[3],urc5[3];
-    numtyp bn[5];
-    */
    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;

    int numj, nbor, nbor_end;
@ -1196,8 +1140,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //numtyp qtmp; fetch(qtmp,i,q_tex);
-    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
@ -1303,7 +1245,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
      numtyp uir = uix*xr + uiy*yr + uiz*zr;
-      numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
+      //numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
      numtyp ukr = ukx*xr + uky*yr + ukz*zr;
      numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;

--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -162,6 +162,10 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
                              eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }

+void amoeba_setup_fft(const int size, const int element_type) {
+  AMOEBAMF.setup_fft(size, element_type);
+}
+
 void amoeba_compute_fft1d(void** in, void** out, const int mode) {
  AMOEBAMF.compute_fft1d(in, out, mode);
 }
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -579,7 +579,17 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
 }

 // ---------------------------------------------------------------------------
-// Compute FFT
+// Setup the FFT plan
+// ---------------------------------------------------------------------------
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::setup_fft(const int size, const int element_type)
+{
+
+}
+
+// ---------------------------------------------------------------------------
+// Compute FFT on the device
 // ---------------------------------------------------------------------------

 template <class numtyp, class acctyp>
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -190,7 +190,12 @@ class BaseAmoeba {
    _fieldp.update_host(_max_fieldp_size*8,false);
  }

+  /// setup a plan for FFT, where size is the number of elements
+
+  void setup_fft(const int size, const int element_type=0);
+
  /// compute forward/backward FFT on the device
+
  void compute_fft1d(void** in, void** out, const int mode);

  // -------------------------- DEVICE DATA -------------------------
--- a/src/GPU/amoeba_convolution_gpu.cpp
+++ b/src/GPU/amoeba_convolution_gpu.cpp
@ -21,10 +21,12 @@ using namespace LAMMPS_NS;

 #define SCALE 0

+enum {FORWARD,BACKWARD};
+
 // External functions from GPU library

-int amoeba_setup_fft(const int size);
-int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode);
+int amoeba_setup_fft(const int size, const int element_type);
+int amoeba_compute_fft1d(void* in, void* out, const int mode);

 /* ----------------------------------------------------------------------
   partition an FFT grid across processors
@ -64,6 +66,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
  debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm");
  debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm");
 #endif
+
  // copy owned 4d brick grid values to FFT grid

  n = 0;
@ -88,6 +91,8 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()

  fft1->compute(cfft,cfft,FFT3d::FORWARD);

+  //amoeba_compute_fft1d(cfft,cfft,FORWARD);
+
  if (SCALE) {
    double scale = 1.0/nfft_global;
    for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -38,6 +38,7 @@
 using namespace LAMMPS_NS;
 using namespace MathConst;

+// same as in amoeba_induce.cpp
 enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP};   // forward comm
 enum{FIELD,ZRSD,TORQUE,UFLD};                               // reverse comm
 enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG};
@ -46,6 +47,7 @@ enum{GEAR,ASPC,LSQR};
 enum{BUILD,APPLY};
 enum{GORDON1,GORDON2};

+// same as in pair_amoeba.cpp
 enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};

 #define DEBYE 4.80321    // conversion factor from q-Angs (real units) to Debye
@ -188,7 +190,6 @@ void PairAmoebaGPU::init_style()
      ic_kspace =
        new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC);
    }
-
  }
 }