Feb2021 GPU Package Update - GPU Package Files

2021-02-15 08:20:50 -08:00
parent 16004e8f45
commit e7e2d2323b
345 changed files with 13424 additions and 7708 deletions
--- a/lib/gpu/lal_aux_fun1.h
+++ b/lib/gpu/lal_aux_fun1.h
@@ -40,170 +40,521 @@
    nbor_begin+=offset;                                                      \
  }

-#if (ARCH < 300)
+#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset,           \
+                    i, numj, stride, nbor_end, nbor_begin)                   \
+    i=nbor_mem[ii];                                                          \
+    nbor_begin=ii+nbor_stride;                                               \
+    numj=nbor_mem[nbor_begin];                                               \
+    nbor_begin+=nbor_stride+ii*(t_per_atom-1);                               \
+    stride=fast_mul(t_per_atom,nbor_stride);                                 \
+    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj &             \
+                                                          (t_per_atom-1));   \
+    nbor_begin+=offset;

-#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
-                      eflag, vflag, ans, engv)                              \
-  if (t_per_atom>1) {                                                       \
-    __local acctyp red_acc[6][BLOCK_PAIR];                                  \
-    red_acc[0][tid]=f.x;                                                    \
-    red_acc[1][tid]=f.y;                                                    \
-    red_acc[2][tid]=f.z;                                                    \
-    red_acc[3][tid]=energy;                                                 \
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
-      if (offset < s) {                                                     \
-        for (int r=0; r<4; r++)                                             \
-          red_acc[r][tid] += red_acc[r][tid+s];                             \
-      }                                                                     \
-    }                                                                       \
-    f.x=red_acc[0][tid];                                                    \
-    f.y=red_acc[1][tid];                                                    \
-    f.z=red_acc[2][tid];                                                    \
-    energy=red_acc[3][tid];                                                 \
-    if (vflag>0) {                                                          \
-      for (int r=0; r<6; r++)                                               \
-        red_acc[r][tid]=virial[r];                                          \
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
-        if (offset < s) {                                                   \
-          for (int r=0; r<6; r++)                                           \
-            red_acc[r][tid] += red_acc[r][tid+s];                           \
-        }                                                                   \
-      }                                                                     \
-      for (int r=0; r<6; r++)                                               \
-        virial[r]=red_acc[r][tid];                                          \
+#if (SHUFFLE_AVAIL == 0)
+
+#define simd_reduce_add1(width, local, offset, tid, one)                    \
+  local[0][tid]=one;                                                        \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    simdsync();                                                             \
+    if (offset < s) local[0][tid] += local[0][tid+s];                       \
+  }                                                                         \
+  if (offset==0) one=local[0][tid];
+
+#define simd_reduce_add2(width, local, offset, tid, one, two)               \
+  local[0][tid]=one;                                                        \
+  local[1][tid]=two;                                                        \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    simdsync();                                                             \
+    if (offset < s) {                                                       \
+      local[0][tid] += local[0][tid+s];                                     \
+      local[1][tid] += local[1][tid+s];                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0) {                                                          \
-    int ei=ii;                                                              \
-    if (eflag>0) {                                                          \
-      engv[ei]=energy*(acctyp)0.5;                                          \
-      ei+=inum;                                                             \
+    one=local[0][tid];                                                      \
+    two=local[1][tid];                                                      \
+  }
+
+#define simd_reduce_add3(width, local, offset, tid, one, two, three)        \
+  local[0][tid]=one;                                                        \
+  local[1][tid]=two;                                                        \
+  local[2][tid]=three;                                                      \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    simdsync();                                                             \
+    if (offset < s) {                                                       \
+      local[0][tid] += local[0][tid+s];                                     \
+      local[1][tid] += local[1][tid+s];                                     \
+      local[2][tid] += local[2][tid+s];                                     \
    }                                                                       \
-    if (vflag>0) {                                                          \
-      for (int i=0; i<6; i++) {                                             \
-        engv[ei]=virial[i]*(acctyp)0.5;                                     \
-        ei+=inum;                                                           \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    one=local[0][tid];                                                      \
+    two=local[1][tid];                                                      \
+    three=local[2][tid];                                                    \
+  }
+
+#define simd_reduce_add6(width, local, offset, tid, one, two, three,        \
+                         four, five, six)                                   \
+  local[0][tid]=one;                                                        \
+  local[1][tid]=two;                                                        \
+  local[2][tid]=three;                                                      \
+  local[3][tid]=four;                                                       \
+  local[4][tid]=five;                                                       \
+  local[5][tid]=six;                                                        \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    simdsync();                                                             \
+    if (offset < s) {                                                       \
+      local[0][tid] += local[0][tid+s];                                     \
+      local[1][tid] += local[1][tid+s];                                     \
+      local[2][tid] += local[2][tid+s];                                     \
+      local[3][tid] += local[3][tid+s];                                     \
+      local[4][tid] += local[4][tid+s];                                     \
+      local[5][tid] += local[5][tid+s];                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    one=local[0][tid];                                                      \
+    two=local[1][tid];                                                      \
+    three=local[2][tid];                                                    \
+    four=local[3][tid];                                                     \
+    five=local[4][tid];                                                     \
+    six=local[5][tid];                                                      \
+  }
+
+#define simd_reduce_arr(trip, width, local, offset, tid, arr)               \
+  for (int r=0; r<trip; r++)                                                \
+    local[r][tid]=arr[r];                                                   \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    simdsync();                                                             \
+    if (offset < s) {                                                       \
+      for (int r=0; r<trip; r++)                                            \
+        local[r][tid] += local[r][tid+s];                                   \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    for (int r=0; r<trip; r++)                                              \
+      arr[r]=local[r][tid];                                                 \
+  }
+
+#define block_reduce_add1(width, local, tid, one)                           \
+  local[0][tid]=one;                                                        \
+  for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) {                   \
+    __syncthreads();                                                        \
+    if (tid < s) local[0][tid] += local[0][tid+s];                          \
+  }                                                                         \
+  if (tid<width) {                                                          \
+    for (unsigned int s=width/2; s>0; s>>=1) {                              \
+      simdsync();                                                           \
+      if (tid < s) local[0][tid] += local[0][tid+s];                        \
+    }                                                                       \
+    if (tid==0) one=local[0][tid];                                          \
+  }
+
+#define block_reduce_add2(width, local, tid, one, two)                      \
+  local[0][tid]=one;                                                        \
+  local[1][tid]=two;                                                        \
+  for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) {                   \
+    __syncthreads();                                                        \
+    if (tid < s) {                                                          \
+      local[0][tid] += local[0][tid+s];                                     \
+      local[1][tid] += local[1][tid+s];                                     \
+    }                                                                       \
+  }                                                                         \
+  if (tid<width) {                                                          \
+    for (unsigned int s=width/2; s>0; s>>=1) {                              \
+      simdsync();                                                           \
+      if (tid < s) {                                                        \
+        local[0][tid] += local[0][tid+s];                                   \
+        local[1][tid] += local[1][tid+s];                                   \
+      }                                                                     \
+    }                                                                       \
+    if (tid==0) {                                                           \
+      one=local[0][tid];                                                    \
+      two=local[1][tid];                                                    \
+    }                                                                       \
+  }
+
+#define block_reduce_arr(trip, width, local, tid, arr)                      \
+  for (int r=0; r<trip; r++)                                                \
+    local[r][tid]=arr[r];                                                   \
+  for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) {                   \
+    __syncthreads();                                                        \
+    if (tid < s) {                                                          \
+      for (int r=0; r<trip; r++)                                            \
+        local[r][tid] += local[r][tid+s];                                   \
+    }                                                                       \
+  }                                                                         \
+  if (tid<width) {                                                          \
+    for (unsigned int s=width/2; s>0; s>>=1) {                              \
+      simdsync();                                                           \
+      if (tid < s) {                                                        \
+        for (int r=0; r<trip; r++)                                          \
+          local[r][tid] += local[r][tid+s];                                 \
+      }                                                                     \
+    }                                                                       \
+    if (tid==0) {                                                           \
+      for (int r=0; r<trip; r++)                                            \
+        arr[r]=local[r][tid];                                               \
+    }                                                                       \
+  }
+
+#define local_allocate_store_pair()                                         \
+    __local acctyp red_acc[6][BLOCK_PAIR];
+#define local_allocate_store_charge()                                       \
+    __local acctyp red_acc[6][BLOCK_PAIR];
+#define local_allocate_store_bio()                                          \
+    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+#define local_allocate_store_ellipse()                                      \
+    __local acctyp red_acc[6][BLOCK_ELLIPSE];
+#define local_allocate_store_three()                                        \
+    __local acctyp red_acc[6][BLOCK_ELLIPSE];
+
+#define store_answers(f, energy, virial, ii, inum, tid,                     \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
+    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
+      if (eflag) {                                                          \
+        simdsync();                                                         \
+        simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy);         \
+      }                                                                     \
+      if (vflag) {                                                          \
+        simdsync();                                                         \
+        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial);       \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0 && ii<inum) ans[ii]=f;                                      \
+  if (EVFLAG && (eflag || vflag)) {                                         \
+    int ei=BLOCK_ID_X;                                                      \
+    if (eflag!=2 && vflag!=2) {                                             \
+      const int ev_stride=NUM_BLOCKS_X;                                     \
+      if (eflag) {                                                          \
+        simdsync();                                                         \
+        block_reduce_add1(simd_size(), red_acc, tid, energy);               \
+        if (vflag) __syncthreads();                                         \
+        if (tid==0) {                                                       \
+          engv[ei]=energy*(acctyp)0.5;                                      \
+          ei+=ev_stride;                                                    \
+        }                                                                   \
+      }                                                                     \
+      if (vflag) {                                                          \
+        simdsync();                                                         \
+        block_reduce_arr(6, simd_size(), red_acc, tid, virial);             \
+        if (tid==0) {                                                       \
+          for (int r=0; r<6; r++) {                                         \
+            engv[ei]=virial[r]*(acctyp)0.5;                                 \
+            ei+=ev_stride;                                                  \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    } else if (offset==0 && ii<inum) {                                      \
+      int ei=ii;                                                            \
+      if (EVFLAG && eflag) {                                                \
+        engv[ei]=energy*(acctyp)0.5;                                        \
+        ei+=inum;                                                           \
+      }                                                                     \
+      if (EVFLAG && vflag) {                                                \
+        for (int i=0; i<6; i++) {                                           \
+          engv[ei]=virial[i]*(acctyp)0.5;                                   \
+          ei+=inum;                                                         \
+        }                                                                   \
      }                                                                     \
    }                                                                       \
-    ans[ii]=f;                                                              \
  }

 #define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
-    __local acctyp red_acc[6][BLOCK_PAIR];                                  \
-    red_acc[0][tid]=f.x;                                                    \
-    red_acc[1][tid]=f.y;                                                    \
-    red_acc[2][tid]=f.z;                                                    \
-    red_acc[3][tid]=energy;                                                 \
-    red_acc[4][tid]=e_coul;                                                 \
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
-      if (offset < s) {                                                     \
-        for (int r=0; r<5; r++)                                             \
-          red_acc[r][tid] += red_acc[r][tid+s];                             \
+    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
+    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
+      if (eflag) {                                                          \
+        simdsync();                                                         \
+        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
      }                                                                     \
-    }                                                                       \
-    f.x=red_acc[0][tid];                                                    \
-    f.y=red_acc[1][tid];                                                    \
-    f.z=red_acc[2][tid];                                                    \
-    energy=red_acc[3][tid];                                                 \
-    e_coul=red_acc[4][tid];                                                 \
-    if (vflag>0) {                                                          \
-      for (int r=0; r<6; r++)                                               \
-        red_acc[r][tid]=virial[r];                                          \
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
-        if (offset < s) {                                                   \
-          for (int r=0; r<6; r++)                                           \
-            red_acc[r][tid] += red_acc[r][tid+s];                           \
-        }                                                                   \
+      if (vflag) {                                                          \
+        simdsync();                                                         \
+        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial);       \
      }                                                                     \
-      for (int r=0; r<6; r++)                                               \
-        virial[r]=red_acc[r][tid];                                          \
    }                                                                       \
  }                                                                         \
-  if (offset==0) {                                                          \
-    int ei=ii;                                                              \
-    if (eflag>0) {                                                          \
-      engv[ei]=energy*(acctyp)0.5;                                          \
-      ei+=inum;                                                             \
-      engv[ei]=e_coul*(acctyp)0.5;                                          \
-      ei+=inum;                                                             \
-    }                                                                       \
-    if (vflag>0) {                                                          \
-      for (int i=0; i<6; i++) {                                             \
-        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+  if (offset==0 && ii<inum) ans[ii]=f;                                      \
+  if (EVFLAG && (eflag || vflag)) {                                         \
+    int ei=BLOCK_ID_X;                                                      \
+    const int ev_stride=NUM_BLOCKS_X;                                       \
+    if (eflag!=2 && vflag!=2) {                                             \
+      if (eflag) {                                                          \
+        simdsync();                                                         \
+        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);       \
+        if (vflag) __syncthreads();                                         \
+        if (tid==0) {                                                       \
+          engv[ei]=energy*(acctyp)0.5;                                      \
+          ei+=ev_stride;                                                    \
+          engv[ei]=e_coul*(acctyp)0.5;                                      \
+          ei+=ev_stride;                                                    \
+        }                                                                   \
+      }                                                                     \
+      if (vflag) {                                                          \
+        simdsync();                                                         \
+        block_reduce_arr(6, simd_size(), red_acc, tid, virial);             \
+        if (tid==0) {                                                       \
+          for (int r=0; r<6; r++) {                                         \
+            engv[ei]=virial[r]*(acctyp)0.5;                                 \
+            ei+=ev_stride;                                                  \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    } else if (offset==0 && ii<inum) {                                      \
+      int ei=ii;                                                            \
+      if (EVFLAG && eflag) {                                                \
+        engv[ei]=energy*(acctyp)0.5;                                        \
+        ei+=inum;                                                           \
+        engv[ei]=e_coul*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
+      if (EVFLAG && vflag) {                                                \
+        for (int i=0; i<6; i++) {                                           \
+          engv[ei]=virial[i]*(acctyp)0.5;                                   \
+          ei+=inum;                                                         \
+        }                                                                   \
+      }                                                                     \
    }                                                                       \
-    ans[ii]=f;                                                              \
  }

 #else

-#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
-                      eflag, vflag, ans, engv)                              \
+#define simd_reduce_add1(width, one)                                        \
+  for (unsigned int s=width/2; s>0; s>>=1) one += shfl_down(one, s, width);
+
+#define simd_reduce_add2(width, one, two)                                   \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    one += shfl_down(one, s, width);                                        \
+    two += shfl_down(two, s, width);                                        \
+  }
+
+#define simd_reduce_add3(width, one, two, three)                            \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    one += shfl_down(one, s, width);                                        \
+    two += shfl_down(two, s, width);                                        \
+    three += shfl_down(three, s, width);                                    \
+  }
+
+#define simd_reduce_add6(width, one, two, three, four, five, six)           \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    one += shfl_down(one, s, width);                                        \
+    two += shfl_down(two, s, width);                                        \
+    three += shfl_down(three, s, width);                                    \
+    four += shfl_down(four, s, width);                                      \
+    five += shfl_down(five, s, width);                                      \
+    six += shfl_down(six, s, width);                                        \
+  }
+
+#define simd_reduce_arr(trip, width, arr)                                   \
+  for (unsigned int s=width/2; s>0; s>>=1) {                                \
+    for (int r=0; r<trip; r++)                                              \
+      arr[r] += shfl_down(arr[r], s, width);                                \
+  }
+
+#if (EVFLAG == 1)
+
+#define local_allocate_store_pair()                                         \
+    __local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];
+#define local_allocate_store_charge()                                       \
+    __local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];
+#define local_allocate_store_bio()                                          \
+    __local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];
+#define local_allocate_store_ellipse()
+#define local_allocate_store_three()                                        \
+    __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];
+
+#define store_answers(f, energy, virial, ii, inum, tid,                     \
+                      t_per_atom, offset, eflag, vflag, ans, engv)          \
  if (t_per_atom>1) {                                                       \
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
-        f.x += shfl_xor(f.x, s, t_per_atom);                                \
-        f.y += shfl_xor(f.y, s, t_per_atom);                                \
-        f.z += shfl_xor(f.z, s, t_per_atom);                                \
-        energy += shfl_xor(energy, s, t_per_atom);                          \
-    }                                                                       \
-    if (vflag>0) {                                                          \
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
-          for (int r=0; r<6; r++)                                           \
-            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
-      }                                                                     \
+    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
+    if (vflag==2 || eflag==2) {                                             \
+      if (eflag)                                                            \
+        simd_reduce_add1(t_per_atom,energy);                                \
+      if (vflag)                                                            \
+        simd_reduce_arr(6, t_per_atom,virial);                              \
    }                                                                       \
  }                                                                         \
-  if (offset==0) {                                                          \
-    int ei=ii;                                                              \
-    if (eflag>0) {                                                          \
-      engv[ei]=energy*(acctyp)0.5;                                          \
-      ei+=inum;                                                             \
-    }                                                                       \
-    if (vflag>0) {                                                          \
-      for (int i=0; i<6; i++) {                                             \
-        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+  if (offset==0 && ii<inum) ans[ii]=f;                                      \
+  if (eflag || vflag) {                                                     \
+    if (eflag!=2 && vflag!=2) {                                             \
+      const int vwidth = simd_size();                                       \
+      const int voffset = tid & (simd_size() - 1);                          \
+      const int bnum = tid/simd_size();                                     \
+      int active_subgs = BLOCK_SIZE_X/simd_size();                          \
+      for ( ; active_subgs > 1; active_subgs /= vwidth) {                   \
+        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads();       \
+        if (bnum < active_subgs) {                                          \
+          if (eflag) {                                                      \
+            simd_reduce_add1(vwidth, energy);                               \
+            if (voffset==0) red_acc[6][bnum] = energy;                      \
+          }                                                                 \
+          if (vflag) {                                                      \
+            simd_reduce_arr(6, vwidth, virial);                             \
+            if (voffset==0)                                                 \
+              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r];           \
+          }                                                                 \
+        }                                                                   \
+                                                                            \
+        __syncthreads();                                                    \
+        if (tid < active_subgs) {                                           \
+            if (eflag) energy = red_acc[6][tid];                            \
+          if (vflag)                                                        \
+            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid];        \
+        } else {                                                            \
+          if (eflag) energy = (acctyp)0;                                    \
+          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0;     \
+        }                                                                   \
+      }                                                                     \
+                                                                            \
+      if (bnum == 0) {                                                      \
+        int ei=BLOCK_ID_X;                                                  \
+        const int ev_stride=NUM_BLOCKS_X;                                   \
+        if (eflag) {                                                        \
+          simd_reduce_add1(vwidth, energy);                                 \
+          if (tid==0) {                                                     \
+            engv[ei]=energy*(acctyp)0.5;                                    \
+            ei+=ev_stride;                                                  \
+          }                                                                 \
+        }                                                                   \
+        if (vflag) {                                                        \
+          simd_reduce_arr(6, vwidth, virial);                               \
+          if (tid==0) {                                                     \
+            for (int r=0; r<6; r++) {                                       \
+              engv[ei]=virial[r]*(acctyp)0.5;                               \
+              ei+=ev_stride;                                                \
+            }                                                               \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    } else if (offset==0 && ii<inum) {                                      \
+      int ei=ii;                                                            \
+      if (eflag) {                                                          \
+        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
+      if (vflag) {                                                          \
+        for (int i=0; i<6; i++) {                                           \
+          engv[ei]=virial[i]*(acctyp)0.5;                                   \
+          ei+=inum;                                                         \
+        }                                                                   \
+      }                                                                     \
    }                                                                       \
-    ans[ii]=f;                                                              \
  }

 #define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                         t_per_atom, offset, eflag, vflag, ans, engv)        \
   if (t_per_atom>1) { /* combine partial sums across t_per_atom threads */  \
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
-      f.x += shfl_xor(f.x, s, t_per_atom);                                  \
-      f.y += shfl_xor(f.y, s, t_per_atom);                                  \
-      f.z += shfl_xor(f.z, s, t_per_atom);                                  \
-      energy += shfl_xor(energy, s, t_per_atom);                            \
-      e_coul += shfl_xor(e_coul, s, t_per_atom);                            \
-    }                                                                       \
-    if (vflag>0) {                                                          \
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
-          for (int r=0; r<6; r++)                                           \
-            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
-      }                                                                     \
+    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
+    if (vflag==2 || eflag==2) { /* value 2 selects per-atom output */       \
+      if (eflag)                                                            \
+        simd_reduce_add2(t_per_atom,energy,e_coul);                         \
+      if (vflag)                                                            \
+        simd_reduce_arr(6, t_per_atom,virial);                              \
     }                                                                       \
   }                                                                         \
-  if (offset==0) {                                                          \
-    int ei=ii;                                                              \
-    if (eflag>0) {                                                          \
-      engv[ei]=energy*(acctyp)0.5;                                          \
-      ei+=inum;                                                             \
-      engv[ei]=e_coul*(acctyp)0.5;                                          \
-      ei+=inum;                                                             \
-    }                                                                       \
-    if (vflag>0) {                                                          \
-      for (int i=0; i<6; i++) {                                             \
-        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+  if (offset==0 && ii<inum) ans[ii]=f; /* store the summed force */         \
+  if (eflag || vflag) {                                                     \
+    if (eflag!=2 && vflag!=2) { /* emit one total per block */              \
+      const int vwidth = simd_size(); /* assumed to be a power of two */    \
+      const int voffset = tid & (simd_size() - 1);                          \
+      const int bnum = tid/simd_size(); /* subgroup index in block */       \
+      int active_subgs = BLOCK_SIZE_X/simd_size();                          \
+      for ( ; active_subgs > 1; active_subgs /= vwidth) {                   \
+        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads();       \
+        if (bnum < active_subgs) { /* reduce within each subgroup */        \
+          if (eflag) {                                                      \
+            simd_reduce_add2(vwidth, energy, e_coul);                       \
+            if (voffset==0) { /* lane 0 publishes to red_acc */             \
+              red_acc[6][bnum] = energy;                                    \
+              red_acc[7][bnum] = e_coul; /* red_acc declared elsewhere */   \
+            }                                                               \
+          }                                                                 \
+          if (vflag) {                                                      \
+            simd_reduce_arr(6, vwidth, virial);                             \
+            if (voffset==0)                                                 \
+              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r];           \
+          }                                                                 \
+        }                                                                   \
+                                                                            \
+        __syncthreads();                                                    \
+        if (tid < active_subgs) { /* reload one partial per thread */       \
+          if (eflag) {                                                      \
+            energy = red_acc[6][tid];                                       \
+            e_coul = red_acc[7][tid];                                       \
+          }                                                                 \
+          if (vflag)                                                        \
+            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid];        \
+        } else { /* others contribute zero next round */                    \
+          if (eflag) energy = e_coul = (acctyp)0;                           \
+          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0;     \
+        }                                                                   \
+      }                                                                     \
+                                                                            \
+      if (bnum == 0) { /* first subgroup finishes the sum */                \
+        int ei=BLOCK_ID_X; /* engv slot: block + k*NUM_BLOCKS_X */          \
+        const int ev_stride=NUM_BLOCKS_X;                                   \
+        if (eflag) {                                                        \
+          simd_reduce_add2(vwidth, energy, e_coul);                         \
+          if (tid==0) {                                                     \
+            engv[ei]=energy*(acctyp)0.5;                                    \
+            ei+=ev_stride;                                                  \
+            engv[ei]=e_coul*(acctyp)0.5;                                    \
+            ei+=ev_stride;                                                  \
+          }                                                                 \
+        }                                                                   \
+        if (vflag) {                                                        \
+          simd_reduce_arr(6, vwidth, virial);                               \
+          if (tid==0) {                                                     \
+            for (int r=0; r<6; r++) {                                       \
+              engv[ei]=virial[r]*(acctyp)0.5;                               \
+              ei+=ev_stride;                                                \
+            }                                                               \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    } else if (offset==0 && ii<inum) { /* per-atom: slot ii + k*inum */     \
+      int ei=ii;                                                            \
+      if (eflag) {                                                          \
+        engv[ei]=energy*(acctyp)0.5;                                        \
+        ei+=inum;                                                           \
+        engv[ei]=e_coul*(acctyp)0.5;                                        \
         ei+=inum;                                                           \
       }                                                                     \
+      if (vflag) {                                                          \
+        for (int i=0; i<6; i++) {                                           \
+          engv[ei]=virial[i]*(acctyp)0.5;                                   \
+          ei+=inum;                                                         \
+        }                                                                   \
+      }                                                                     \
     }                                                                       \
-    ans[ii]=f;                                                              \
   }

+#else
+
+#define local_allocate_store_pair() /* these five expand to nothing in this #else branch */
+#define local_allocate_store_charge()
+#define local_allocate_store_bio()
+#define local_allocate_store_ellipse()
+#define local_allocate_store_three()
+
+#define store_answers(f, energy, virial, ii, inum, tid,                     \
+                      t_per_atom, offset, eflag, vflag, ans, engv)          \
+  if (t_per_atom>1) /* force only; energy/virial args are unused here */    \
+    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
+  if (offset==0 && ii<inum) ans[ii]=f; /* second statement of the expansion: do not invoke the macro under an unbraced if */
+
+#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) /* force only; energy/e_coul/virial args unused here */ \
+    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
+  if (offset==0 && ii<inum) ans[ii]=f; /* second statement of the expansion: do not invoke the macro under an unbraced if */
+
+#endif
+
 #endif