Replaced mem allocation/deallocation inside moduli() with using member variables and mem resize if needed

2022-09-18 16:28:30 -05:00
parent caa66d904e
commit 356c46c913
4 changed files with 259 additions and 57 deletions
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -1922,6 +1922,185 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
  }
 }

+
+/* ----------------------------------------------------------------------
+   fphi_mpole = multipole potential from grid
+   fphi_mpole extracts the permanent multipole potential from
+   the particle mesh Ewald grid
+
+   one thread handles one site ii (ii < inum) and accumulates 20 sums
+   (tuv000 ... tuv111) over bsorder x bsorder x bsorder grid points
+
+   thetai1, thetai2, thetai3 = per-site weights paired with the i, j, k
+     grid indices; entry ii*bsorder + b is read as a numtyp4 whose
+     x,y,z,w fields are used as orders 0..3 (presumably the B-spline
+     value and its derivatives; confirm against the host-side setup)
+   igrid = 4 ints per site; the first three are read as the x, y, z
+     grid indices of the site, the fourth is not read here
+   grid = flattened grid, read only at even element indices
+     (presumably real parts of interleaved complex values; confirm)
+   fphi = output, written as fphi[m*inum + ii] for m = 0..19
+   bsorder = number of grid points visited per dimension
+   inum = number of sites
+   nzlo_out, nylo_out, nxlo_out = offsets subtracted from the site's
+     grid indices before the flattened grid index is formed
+   ngridxy, ngridx = multipliers applied to the k and j indices when
+     forming the flattened grid index
+   note: the computed grid index is not bounds-checked in this kernel
+------------------------------------------------------------------------- */
+
+__kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
+                          const __global numtyp4 *restrict thetai2,
+                          const __global numtyp4 *restrict thetai3,
+                          const __global int *restrict igrid,
+                          const __global numtyp *restrict grid,
+                          __global numtyp *restrict fphi,
+                          const int bsorder, const int inum,
+                          const int nzlo_out, const int nylo_out,
+                          const int nxlo_out, const int ngridxy,
+                          const int ngridx)
+{
+  int tid=THREAD_ID_X;
+  int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;  // site index handled by this thread
+
+  if (ii<inum) {  // threads past the last site do nothing
+
+    int nlpts = (bsorder-1) / 2;  // integer division; how far below the site's index the visited range starts
+    
+    int istart = fast_mul(ii,4);  // igrid holds 4 ints per site
+    int igridx = igrid[istart];
+    int igridy = igrid[istart+1];
+    int igridz = igrid[istart+2];
+    
+    // now istart is used to index thetai1, thetai2 and thetai3
+    istart = fast_mul(ii,bsorder);
+
+    // extract the permanent multipole field at each site
+
+    numtyp tuv000 = (numtyp)0.0;  // tuvABC: sum using order A of thetai1, B of thetai2, C of thetai3
+    numtyp tuv001 = (numtyp)0.0;
+    numtyp tuv010 = (numtyp)0.0;
+    numtyp tuv100 = (numtyp)0.0;
+    numtyp tuv200 = (numtyp)0.0;
+    numtyp tuv020 = (numtyp)0.0;
+    numtyp tuv002 = (numtyp)0.0;
+    numtyp tuv110 = (numtyp)0.0;
+    numtyp tuv101 = (numtyp)0.0;
+    numtyp tuv011 = (numtyp)0.0;
+    numtyp tuv300 = (numtyp)0.0;
+    numtyp tuv030 = (numtyp)0.0;
+    numtyp tuv003 = (numtyp)0.0;
+    numtyp tuv210 = (numtyp)0.0;
+    numtyp tuv201 = (numtyp)0.0;
+    numtyp tuv120 = (numtyp)0.0;
+    numtyp tuv021 = (numtyp)0.0;
+    numtyp tuv102 = (numtyp)0.0;
+    numtyp tuv012 = (numtyp)0.0;
+    numtyp tuv111 = (numtyp)0.0;
+
+    int k = (igridz - nzlo_out) - nlpts;  // first k index visited, relative to nzlo_out
+    for (int kb = 0; kb < bsorder; kb++) {
+      /*
+      v0 = thetai3[m][kb][0];
+      v1 = thetai3[m][kb][1];
+      v2 = thetai3[m][kb][2];
+      v3 = thetai3[m][kb][3];
+      */
+      int i3 = istart + kb;
+      numtyp4 tha3 = thetai3[i3];
+      numtyp v0 = tha3.x;
+      numtyp v1 = tha3.y;
+      numtyp v2 = tha3.z;
+      numtyp v3 = tha3.w;
+      numtyp tu00 = (numtyp)0.0;  // tuAB: partial sums over i and j for the current k, reset every kb
+      numtyp tu10 = (numtyp)0.0;
+      numtyp tu01 = (numtyp)0.0;
+      numtyp tu20 = (numtyp)0.0;
+      numtyp tu11 = (numtyp)0.0;
+      numtyp tu02 = (numtyp)0.0;
+      numtyp tu30 = (numtyp)0.0;
+      numtyp tu21 = (numtyp)0.0;
+      numtyp tu12 = (numtyp)0.0;
+      numtyp tu03 = (numtyp)0.0;
+
+      int j = (igridy - nylo_out) - nlpts;  // first j index visited, relative to nylo_out
+      for (int jb = 0; jb < bsorder; jb++) {
+        /*
+        u0 = thetai2[m][jb][0];
+        u1 = thetai2[m][jb][1];
+        u2 = thetai2[m][jb][2];
+        u3 = thetai2[m][jb][3];
+        */
+        int i2 = istart + jb;
+        numtyp4 tha2 = thetai2[i2];
+        numtyp u0 = tha2.x;
+        numtyp u1 = tha2.y;
+        numtyp u2 = tha2.z;
+        numtyp u3 = tha2.w;
+        numtyp t0 = (numtyp)0.0;  // tA: partial sums over i for the current (k,j), reset every jb
+        numtyp t1 = (numtyp)0.0;
+        numtyp t2 = (numtyp)0.0;
+        numtyp t3 = (numtyp)0.0;
+
+        int i = (igridx - nxlo_out) - nlpts;  // first i index visited, relative to nxlo_out
+        for (int ib = 0; ib < bsorder; ib++) {
+          int i1 = istart + ib;
+          numtyp4 tha1 = thetai1[i1];
+          int gidx = 2*(k*ngridxy + j*ngridx + i);  // factor 2: only even elements of grid are read
+          numtyp tq = grid[gidx];
+          t0 += tq*tha1.x;
+          t1 += tq*tha1.y;
+          t2 += tq*tha1.z;
+          t3 += tq*tha1.w;
+          i++;
+        }
+
+        tu00 += t0*u0;  // fold the i sums into the (i,j) sums with the thetai2 weights
+        tu10 += t1*u0;
+        tu01 += t0*u1;
+        tu20 += t2*u0;
+        tu11 += t1*u1;
+        tu02 += t0*u2;
+        tu30 += t3*u0;
+        tu21 += t2*u1;
+        tu12 += t1*u2;
+        tu03 += t0*u3;
+        j++;
+      }
+
+      tuv000 += tu00*v0;  // fold the (i,j) sums into the final sums with the thetai3 weights
+      tuv100 += tu10*v0;
+      tuv010 += tu01*v0;
+      tuv001 += tu00*v1;
+      tuv200 += tu20*v0;
+      tuv020 += tu02*v0;
+      tuv002 += tu00*v2;
+      tuv110 += tu11*v0;
+      tuv101 += tu10*v1;
+      tuv011 += tu01*v1;
+      tuv300 += tu30*v0;
+      tuv030 += tu03*v0;
+      tuv003 += tu00*v3;
+      tuv210 += tu21*v0;
+      tuv201 += tu20*v1;
+      tuv120 += tu12*v0;
+      tuv021 += tu02*v1;
+      tuv102 += tu10*v2;
+      tuv012 += tu01*v2;
+      tuv111 += tu11*v1;
+      k++;
+    }
+
+    numtyp buf[20];  // the 20 sums, in the order they are stored to fphi
+    buf[0] = tuv000;
+    buf[1] = tuv100;
+    buf[2] = tuv010;
+    buf[3] = tuv001;
+    buf[4] = tuv200;
+    buf[5] = tuv020;
+    buf[6] = tuv002;
+    buf[7] = tuv110;
+    buf[8] = tuv101;
+    buf[9] = tuv011;
+    buf[10] = tuv300;
+    buf[11] = tuv030;
+    buf[12] = tuv003;
+    buf[13] = tuv210;
+    buf[14] = tuv201;
+    buf[15] = tuv120;
+    buf[16] = tuv021;
+    buf[17] = tuv102;
+    buf[18] = tuv012;
+    buf[19] = tuv111;
+
+    int idx = ii;  // component m of site ii is stored at fphi[m*inum + ii]
+    for (int m = 0; m < 20; m++) {
+      fphi[idx] = buf[m];
+      idx += inum;
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
   scan standard neighbor list and make it compatible with 1-5 neighbors
   if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15