// **************************************************************************
//                                 aux_fun1.h
//                             -------------------
//                           W. Michael Brown (ORNL)
//
//  Device code for pair style auxiliary functions
//
// __________________________________________________________________________
//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
//    begin                : Sat Oct 22 2011
//    email                : brownw@ornl.gov
// ***************************************************************************/

#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

#define atom_info(t_per_atom, ii, tid, offset)                               \
  tid=THREAD_ID_X;                                                           \
  offset=tid & (t_per_atom-1);                                               \
  ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;

#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset,  \
                  i, numj, n_stride, nbor_end, nbor_begin)                   \
  i=dev_nbor[ii];                                                            \
  nbor_begin=ii+nbor_pitch;                                                  \
  numj=dev_nbor[nbor_begin];                                                 \
  if (dev_nbor==dev_packed) {                                                \
    nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1);                        \
    n_stride=fast_mul(t_per_atom,nbor_pitch);                                \
    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
    nbor_begin+=offset;                                                      \
  } else {                                                                   \
    nbor_begin+=nbor_pitch;                                                  \
    nbor_begin=dev_nbor[nbor_begin];                                         \
    nbor_end=nbor_begin+numj;                                                \
    n_stride=t_per_atom;                                                     \
    nbor_begin+=offset;                                                      \
  }

#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset,           \
                    i, numj, stride, nbor_end, nbor_begin)                   \
    i=nbor_mem[ii];                                                          \
    nbor_begin=ii+nbor_stride;                                               \
    numj=nbor_mem[nbor_begin];                                               \
    nbor_begin+=nbor_stride+ii*(t_per_atom-1);                               \
    stride=fast_mul(t_per_atom,nbor_stride);                                 \
    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj &             \
                                                          (t_per_atom-1));   \
    nbor_begin+=offset;

#if (SHUFFLE_AVAIL == 0)

#define simd_reduce_add1(width, local, offset, tid, one)                    \
  local[0][tid]=one;                                                        \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    simdsync();                                                             \
    if (offset < s) local[0][tid] += local[0][tid+s];                       \
  }                                                                         \
  if (offset==0) one=local[0][tid];

#define simd_reduce_add2(width, local, offset, tid, one, two)               \
  local[0][tid]=one;                                                        \
  local[1][tid]=two;                                                        \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    simdsync();                                                             \
    if (offset < s) {                                                       \
      local[0][tid] += local[0][tid+s];                                     \
      local[1][tid] += local[1][tid+s];                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0) {                                                          \
    one=local[0][tid];                                                      \
    two=local[1][tid];                                                      \
  }

#define simd_reduce_add3(width, local, offset, tid, one, two, three)        \
  local[0][tid]=one;                                                        \
  local[1][tid]=two;                                                        \
  local[2][tid]=three;                                                      \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    simdsync();                                                             \
    if (offset < s) {                                                       \
      local[0][tid] += local[0][tid+s];                                     \
      local[1][tid] += local[1][tid+s];                                     \
      local[2][tid] += local[2][tid+s];                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0) {                                                          \
    one=local[0][tid];                                                      \
    two=local[1][tid];                                                      \
    three=local[2][tid];                                                    \
  }

#define simd_reduce_add6(width, local, offset, tid, one, two, three,        \
                         four, five, six)                                   \
  local[0][tid]=one;                                                        \
  local[1][tid]=two;                                                        \
  local[2][tid]=three;                                                      \
  local[3][tid]=four;                                                       \
  local[4][tid]=five;                                                       \
  local[5][tid]=six;                                                        \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    simdsync();                                                             \
    if (offset < s) {                                                       \
      local[0][tid] += local[0][tid+s];                                     \
      local[1][tid] += local[1][tid+s];                                     \
      local[2][tid] += local[2][tid+s];                                     \
      local[3][tid] += local[3][tid+s];                                     \
      local[4][tid] += local[4][tid+s];                                     \
      local[5][tid] += local[5][tid+s];                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0) {                                                          \
    one=local[0][tid];                                                      \
    two=local[1][tid];                                                      \
    three=local[2][tid];                                                    \
    four=local[3][tid];                                                     \
    five=local[4][tid];                                                     \
    six=local[5][tid];                                                      \
  }

#define simd_reduce_arr(trip, width, local, offset, tid, arr)               \
  for (int r=0; r<trip; r++)                                                \
    local[r][tid]=arr[r];                                                   \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    simdsync();                                                             \
    if (offset < s) {                                                       \
      for (int r=0; r<trip; r++)                                            \
        local[r][tid] += local[r][tid+s];                                   \
    }                                                                       \
  }                                                                         \
  if (offset==0) {                                                          \
    for (int r=0; r<trip; r++)                                              \
      arr[r]=local[r][tid];                                                 \
  }

#define block_reduce_add1(width, local, tid, one)                           \
  local[0][tid]=one;                                                        \
  for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) {                   \
    __syncthreads();                                                        \
    if (tid < s) local[0][tid] += local[0][tid+s];                          \
  }                                                                         \
  if (tid<width) {                                                          \
    for (unsigned int s=width/2; s>0; s>>=1) {                              \
      simdsync();                                                           \
      if (tid < s) local[0][tid] += local[0][tid+s];                        \
    }                                                                       \
    if (tid==0) one=local[0][tid];                                          \
  }

#define block_reduce_add2(width, local, tid, one, two)                      \
  local[0][tid]=one;                                                        \
  local[1][tid]=two;                                                        \
  for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) {                   \
    __syncthreads();                                                        \
    if (tid < s) {                                                          \
      local[0][tid] += local[0][tid+s];                                     \
      local[1][tid] += local[1][tid+s];                                     \
    }                                                                       \
  }                                                                         \
  if (tid<width) {                                                          \
    for (unsigned int s=width/2; s>0; s>>=1) {                              \
      simdsync();                                                           \
      if (tid < s) {                                                        \
        local[0][tid] += local[0][tid+s];                                   \
        local[1][tid] += local[1][tid+s];                                   \
      }                                                                     \
    }                                                                       \
    if (tid==0) {                                                           \
      one=local[0][tid];                                                    \
      two=local[1][tid];                                                    \
    }                                                                       \
  }

#define block_reduce_arr(trip, width, local, tid, arr)                      \
  for (int r=0; r<trip; r++)                                                \
    local[r][tid]=arr[r];                                                   \
  for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) {                   \
    __syncthreads();                                                        \
    if (tid < s) {                                                          \
      for (int r=0; r<trip; r++)                                            \
        local[r][tid] += local[r][tid+s];                                   \
    }                                                                       \
  }                                                                         \
  if (tid<width) {                                                          \
    for (unsigned int s=width/2; s>0; s>>=1) {                              \
      simdsync();                                                           \
      if (tid < s) {                                                        \
        for (int r=0; r<trip; r++)                                          \
          local[r][tid] += local[r][tid+s];                                 \
      }                                                                     \
    }                                                                       \
    if (tid==0) {                                                           \
      for (int r=0; r<trip; r++)                                            \
        arr[r]=local[r][tid];                                               \
    }                                                                       \
  }

#define local_allocate_store_pair()                                         \
    __local acctyp red_acc[6][BLOCK_PAIR];
#define local_allocate_store_charge()                                       \
    __local acctyp red_acc[6][BLOCK_PAIR];
#define local_allocate_store_bio()                                          \
    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
#define local_allocate_store_ellipse()                                      \
    __local acctyp red_acc[6][BLOCK_ELLIPSE];
#define local_allocate_store_three()                                        \
    __local acctyp red_acc[6][BLOCK_ELLIPSE];

#define store_answers(f, energy, virial, ii, inum, tid,                     \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
      if (eflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy);         \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial);       \
      }                                                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) ans[ii]=f;                                      \
  if (EVFLAG && (eflag || vflag)) {                                         \
    int ei=BLOCK_ID_X;                                                      \
    if (eflag!=2 && vflag!=2) {                                             \
      const int ev_stride=NUM_BLOCKS_X;                                     \
      if (eflag) {                                                          \
        simdsync();                                                         \
        block_reduce_add1(simd_size(), red_acc, tid, energy);               \
        if (vflag) __syncthreads();                                         \
        if (tid==0) {                                                       \
          engv[ei]=energy*(acctyp)0.5;                                      \
          ei+=ev_stride;                                                    \
        }                                                                   \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        block_reduce_arr(6, simd_size(), red_acc, tid, virial);             \
        if (tid==0) {                                                       \
          for (int r=0; r<6; r++) {                                         \
            engv[ei]=virial[r]*(acctyp)0.5;                                 \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (EVFLAG && eflag) {                                                \
        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
      if (EVFLAG && vflag) {                                                \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]=virial[i]*(acctyp)0.5;                                   \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
      if (eflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial);       \
      }                                                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) ans[ii]=f;                                      \
  if (EVFLAG && (eflag || vflag)) {                                         \
    int ei=BLOCK_ID_X;                                                      \
    const int ev_stride=NUM_BLOCKS_X;                                       \
    if (eflag!=2 && vflag!=2) {                                             \
      if (eflag) {                                                          \
        simdsync();                                                         \
        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);       \
        if (vflag) __syncthreads();                                         \
        if (tid==0) {                                                       \
          engv[ei]=energy*(acctyp)0.5;                                      \
          ei+=ev_stride;                                                    \
          engv[ei]=e_coul*(acctyp)0.5;                                      \
          ei+=ev_stride;                                                    \
        }                                                                   \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        block_reduce_arr(6, simd_size(), red_acc, tid, virial);             \
        if (tid==0) {                                                       \
          for (int r=0; r<6; r++) {                                         \
            engv[ei]=virial[r]*(acctyp)0.5;                                 \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (EVFLAG && eflag) {                                                \
        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
        engv[ei]=e_coul*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
      if (EVFLAG && vflag) {                                                \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]=virial[i]*(acctyp)0.5;                                   \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

#else

#define simd_reduce_add1(width, one)                                        \
  for (unsigned int s=width/2; s>0; s>>=1) one += shfl_down(one, s, width);

#define simd_reduce_add2(width, one, two)                                   \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    one += shfl_down(one, s, width);                                        \
    two += shfl_down(two, s, width);                                        \
  }

#define simd_reduce_add3(width, one, two, three)                            \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    one += shfl_down(one, s, width);                                        \
    two += shfl_down(two, s, width);                                        \
    three += shfl_down(three, s, width);                                    \
  }

#define simd_reduce_add6(width, one, two, three, four, five, six)           \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    one += shfl_down(one, s, width);                                        \
    two += shfl_down(two, s, width);                                        \
    three += shfl_down(three, s, width);                                    \
    four += shfl_down(four, s, width);                                      \
    five += shfl_down(five, s, width);                                      \
    six += shfl_down(six, s, width);                                        \
  }

#define simd_reduce_arr(trip, width, arr)                                   \
  for (unsigned int s=width/2; s>0; s>>=1) {                                \
    for (int r=0; r<trip; r++)                                              \
      arr[r] += shfl_down(arr[r], s, width);                                \
  }

#if (EVFLAG == 1)

#define local_allocate_store_pair()                                         \
    __local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];
#define local_allocate_store_charge()                                       \
    __local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];
#define local_allocate_store_bio()                                          \
    __local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];
#define local_allocate_store_ellipse()
#define local_allocate_store_three()                                        \
    __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];

#define store_answers(f, energy, virial, ii, inum, tid,                     \
                      t_per_atom, offset, eflag, vflag, ans, engv)          \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
    if (vflag==2 || eflag==2) {                                             \
      if (eflag)                                                            \
        simd_reduce_add1(t_per_atom,energy);                                \
      if (vflag)                                                            \
        simd_reduce_arr(6, t_per_atom,virial);                              \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) ans[ii]=f;                                      \
  if (eflag || vflag) {                                                     \
    if (eflag!=2 && vflag!=2) {                                             \
      const int vwidth = simd_size();                                       \
      const int voffset = tid & (simd_size() - 1);                          \
      const int bnum = tid/simd_size();                                     \
      int active_subgs = BLOCK_SIZE_X/simd_size();                          \
      for ( ; active_subgs > 1; active_subgs /= vwidth) {                   \
        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads();       \
        if (bnum < active_subgs) {                                          \
          if (eflag) {                                                      \
            simd_reduce_add1(vwidth, energy);                               \
            if (voffset==0) red_acc[6][bnum] = energy;                      \
          }                                                                 \
          if (vflag) {                                                      \
            simd_reduce_arr(6, vwidth, virial);                             \
            if (voffset==0)                                                 \
              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r];           \
          }                                                                 \
        }                                                                   \
                                                                            \
        __syncthreads();                                                    \
        if (tid < active_subgs) {                                           \
            if (eflag) energy = red_acc[6][tid];                            \
          if (vflag)                                                        \
            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid];        \
        } else {                                                            \
          if (eflag) energy = (acctyp)0;                                    \
          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0;     \
        }                                                                   \
      }                                                                     \
                                                                            \
      if (bnum == 0) {                                                      \
        int ei=BLOCK_ID_X;                                                  \
        const int ev_stride=NUM_BLOCKS_X;                                   \
        if (eflag) {                                                        \
          simd_reduce_add1(vwidth, energy);                                 \
          if (tid==0) {                                                     \
            engv[ei]=energy*(acctyp)0.5;                                    \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
        if (vflag) {                                                        \
          simd_reduce_arr(6, vwidth, virial);                               \
          if (tid==0) {                                                     \
            for (int r=0; r<6; r++) {                                       \
              engv[ei]=virial[r]*(acctyp)0.5;                               \
              ei+=ev_stride;                                                \
            }                                                               \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (eflag) {                                                          \
        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
      if (vflag) {                                                          \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]=virial[i]*(acctyp)0.5;                                   \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
    if (vflag==2 || eflag==2) {                                             \
      if (eflag)                                                            \
        simd_reduce_add2(t_per_atom,energy,e_coul);                         \
      if (vflag)                                                            \
        simd_reduce_arr(6, t_per_atom,virial);                              \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) ans[ii]=f;                                      \
  if (eflag || vflag) {                                                     \
    if (eflag!=2 && vflag!=2) {                                             \
      const int vwidth = simd_size();                                       \
      const int voffset = tid & (simd_size() - 1);                          \
      const int bnum = tid/simd_size();                                     \
      int active_subgs = BLOCK_SIZE_X/simd_size();                          \
      for ( ; active_subgs > 1; active_subgs /= vwidth) {                   \
        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads();       \
        if (bnum < active_subgs) {                                          \
          if (eflag) {                                                      \
            simd_reduce_add2(vwidth, energy, e_coul);                       \
            if (voffset==0) {                                               \
              red_acc[6][bnum] = energy;                                    \
              red_acc[7][bnum] = e_coul;                                    \
            }                                                               \
          }                                                                 \
          if (vflag) {                                                      \
            simd_reduce_arr(6, vwidth, virial);                             \
            if (voffset==0)                                                 \
              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r];           \
          }                                                                 \
        }                                                                   \
                                                                            \
        __syncthreads();                                                    \
        if (tid < active_subgs) {                                           \
          if (eflag) {                                                      \
            energy = red_acc[6][tid];                                       \
            e_coul = red_acc[7][tid];                                       \
          }                                                                 \
          if (vflag)                                                        \
            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid];        \
        } else {                                                            \
          if (eflag) energy = e_coul = (acctyp)0;                           \
          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0;     \
        }                                                                   \
      }                                                                     \
                                                                            \
      if (bnum == 0) {                                                      \
        int ei=BLOCK_ID_X;                                                  \
        const int ev_stride=NUM_BLOCKS_X;                                   \
        if (eflag) {                                                        \
          simd_reduce_add2(vwidth, energy, e_coul);                         \
          if (tid==0) {                                                     \
            engv[ei]=energy*(acctyp)0.5;                                    \
            ei+=ev_stride;                                                  \
            engv[ei]=e_coul*(acctyp)0.5;                                    \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
        if (vflag) {                                                        \
          simd_reduce_arr(6, vwidth, virial);                               \
          if (tid==0) {                                                     \
            for (int r=0; r<6; r++) {                                       \
              engv[ei]=virial[r]*(acctyp)0.5;                               \
              ei+=ev_stride;                                                \
            }                                                               \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (eflag) {                                                          \
        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
        engv[ei]=e_coul*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
      if (vflag) {                                                          \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]=virial[i]*(acctyp)0.5;                                   \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

#else

#define local_allocate_store_pair()
#define local_allocate_store_charge()
#define local_allocate_store_bio()
#define local_allocate_store_ellipse()
#define local_allocate_store_three()

#define store_answers(f, energy, virial, ii, inum, tid,                     \
                      t_per_atom, offset, eflag, vflag, ans, engv)          \
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) ans[ii]=f;

#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) ans[ii]=f;

#endif

#endif