lammps/lib/gpu/lal_hippo.cu

// **************************************************************************
//                                   hippo.cu
//                             -------------------
//                          Trung Dac Nguyen (Northwestern)
//
//  Device code for acceleration of the hippo pair style
//
// __________________________________________________________________________
//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
//    begin                :
//    email                : trung.nguyen@northwestern.edu
// ***************************************************************************

#if defined(NV_KERNEL) || defined(USE_HIP)
#include <stdio.h>
#include "lal_hippo_extra.h"
//#include "lal_aux_fun1.h"
#ifdef LAMMPS_SMALLBIG
#define tagint int
#endif
#ifdef LAMMPS_BIGBIG
#include "inttypes.h"
#define tagint int64_t
#endif
#ifdef LAMMPS_SMALLSMALL
#define tagint int
#endif
#ifndef _DOUBLE_DOUBLE
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif

#else
#define pos_tex x_
#define q_tex q_
#ifdef LAMMPS_SMALLBIG
#define tagint int
#endif
#ifdef LAMMPS_BIGBIG
#define tagint long
#endif
#ifdef LAMMPS_SMALLSMALL
#define tagint int
#endif

#endif // defined(NV_KERNEL) || defined(USE_HIP)


#if (SHUFFLE_AVAIL == 0)

#define local_allocate_store_ufld()                                         \
    __local acctyp red_acc[6][BLOCK_PAIR];

#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i,    \
                                tep)                                        \
  if (t_per_atom>1) {                                                       \
    red_acc[0][tid]=tq.x;                                                   \
    red_acc[1][tid]=tq.y;                                                   \
    red_acc[2][tid]=tq.z;                                                   \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      simdsync();                                                           \
      if (offset < s) {                                                     \
        for (int r=0; r<3; r++)                                             \
          red_acc[r][tid] += red_acc[r][tid+s];                             \
      }                                                                     \
    }                                                                       \
    tq.x=red_acc[0][tid];                                                   \
    tq.y=red_acc[1][tid];                                                   \
    tq.z=red_acc[2][tid];                                                   \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    tep[i]=tq;                                                              \
  }

#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
                          i, tep)                                           \
  if (t_per_atom>1) {                                                       \
    red_acc[0][tid]=ufld[0];                                                \
    red_acc[1][tid]=ufld[1];                                                \
    red_acc[2][tid]=ufld[2];                                                \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      simdsync();                                                           \
      if (offset < s) {                                                     \
        for (int r=0; r<3; r++)                                             \
          red_acc[r][tid] += red_acc[r][tid+s];                             \
      }                                                                     \
    }                                                                       \
    ufld[0]=red_acc[0][tid];                                                \
    ufld[1]=red_acc[1][tid];                                                \
    ufld[2]=red_acc[2][tid];                                                \
    red_acc[0][tid]=dufld[0];                                               \
    red_acc[1][tid]=dufld[1];                                               \
    red_acc[2][tid]=dufld[2];                                               \
    red_acc[3][tid]=dufld[3];                                               \
    red_acc[4][tid]=dufld[4];                                               \
    red_acc[5][tid]=dufld[5];                                               \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      simdsync();                                                           \
      if (offset < s) {                                                     \
        for (int r=0; r<6; r++)                                             \
          red_acc[r][tid] += red_acc[r][tid+s];                             \
      }                                                                     \
    }                                                                       \
    dufld[0]=red_acc[0][tid];                                               \
    dufld[1]=red_acc[1][tid];                                               \
    dufld[2]=red_acc[2][tid];                                               \
    dufld[3]=red_acc[3][tid];                                               \
    dufld[4]=red_acc[4][tid];                                               \
    dufld[5]=red_acc[5][tid];                                               \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    acctyp4 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
      (numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3];          \
    t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] +       \
      (numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1];          \
    tep[i]=t;                                                               \
  }

#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i,  \
                              fieldp)                                       \
  if (t_per_atom>1) {                                                       \
    red_acc[0][tid]=_fieldp[0];                                             \
    red_acc[1][tid]=_fieldp[1];                                             \
    red_acc[2][tid]=_fieldp[2];                                             \
    red_acc[3][tid]=_fieldp[3];                                             \
    red_acc[4][tid]=_fieldp[4];                                             \
    red_acc[5][tid]=_fieldp[5];                                             \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      simdsync();                                                           \
      if (offset < s) {                                                     \
        for (int r=0; r<6; r++)                                             \
          red_acc[r][tid] += red_acc[r][tid+s];                             \
      }                                                                     \
    }                                                                       \
    _fieldp[0]=red_acc[0][tid];                                             \
    _fieldp[1]=red_acc[1][tid];                                             \
    _fieldp[2]=red_acc[2][tid];                                             \
    _fieldp[3]=red_acc[3][tid];                                             \
    _fieldp[4]=red_acc[4][tid];                                             \
    _fieldp[5]=red_acc[5][tid];                                             \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    acctyp4 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
    fieldp[ii] = f;                                                         \
    fp.x = _fieldp[3];                                                      \
    fp.y = _fieldp[4];                                                      \
    fp.z = _fieldp[5];                                                      \
    fieldp[ii+inum] = fp;                                                   \
  }

#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom  \
                        offset, eflag, vflag, ans, engv, ev_stride)         \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
      if (eflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial);       \
      }                                                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    acctyp4 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
    ans[ii]=old;                                                            \
  }                                                                         \
  if (EVFLAG && (eflag || vflag)) {                                         \
    int ei=BLOCK_ID_X;                                                      \
    if (eflag!=2 && vflag!=2) {                                             \
      if (eflag) {                                                          \
        simdsync();                                                         \
        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);       \
        if (vflag) __syncthreads();                                         \
        if (tid==0) {                                                       \
          engv[ei]+=energy*(acctyp)0.5;                                     \
          ei+=ev_stride;                                                    \
          engv[ei]+=e_coul*(acctyp)0.5;                                     \
          ei+=ev_stride;                                                    \
        }                                                                   \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        block_reduce_arr(6, simd_size(), red_acc, tid, virial);             \
        if (tid==0) {                                                       \
          for (int r=0; r<6; r++) {                                         \
            engv[ei]+=virial[r]*(acctyp)0.5;                                \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (EVFLAG && eflag) {                                                \
        engv[ei]+=energy*(acctyp)0.5;                                       \
        ei+=inum;                                                           \
        engv[ei]+=e_coul*(acctyp)0.5;                                       \
        ei+=inum;                                                           \
      }                                                                     \
      if (EVFLAG && vflag) {                                                \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]+=virial[i]*(acctyp)0.5;                                  \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

#else // SHUFFLE_AVAIL == 1

#define local_allocate_store_ufld()

#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i,    \
                          tep)                                              \
  if (t_per_atom>1) {                                                       \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      tq.x += shfl_down(tq.x, s, t_per_atom);                               \
      tq.y += shfl_down(tq.y, s, t_per_atom);                               \
      tq.z += shfl_down(tq.z, s, t_per_atom);                               \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    tep[i]=tq;                                                              \
  }

#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
                          i, tep)                                           \
  if (t_per_atom>1) {                                                       \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      ufld[0] += shfl_down(ufld[0], s, t_per_atom);                         \
      ufld[1] += shfl_down(ufld[1], s, t_per_atom);                         \
      ufld[2] += shfl_down(ufld[2], s, t_per_atom);                         \
      dufld[0] += shfl_down(dufld[0], s, t_per_atom);                       \
      dufld[1] += shfl_down(dufld[1], s, t_per_atom);                       \
      dufld[2] += shfl_down(dufld[2], s, t_per_atom);                       \
      dufld[3] += shfl_down(dufld[3], s, t_per_atom);                       \
      dufld[4] += shfl_down(dufld[4], s, t_per_atom);                       \
      dufld[5] += shfl_down(dufld[5], s, t_per_atom);                       \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    acctyp4 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
      (numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3];          \
    t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] +       \
      (numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1];          \
    tep[i]=t;                                                               \
  }

#define store_answers_fieldp(_fieldp, ii, inum, tid, t_per_atom, offset, i, \
                             fieldp)                                        \
  if (t_per_atom>1) {                                                       \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom);                   \
      _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom);                   \
      _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom);                   \
      _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom);                   \
      _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom);                   \
      _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom);                   \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    acctyp4 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
    fieldp[ii] = f;                                                         \
    fp.x = _fieldp[3];                                                      \
    fp.y = _fieldp[4];                                                      \
    fp.z = _fieldp[5];                                                      \
    fieldp[ii+inum] = fp;                                                   \
  }

#if (EVFLAG == 1)

#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
                        offset, eflag, vflag, ans, engv, ev_stride)         \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
    if (vflag==2 || eflag==2) {                                             \
      if (eflag)                                                            \
        simd_reduce_add2(t_per_atom,energy,e_coul);                         \
      if (vflag)                                                            \
        simd_reduce_arr(6, t_per_atom,virial);                              \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    acctyp4 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
    ans[ii]=old;                                                            \
  }                                                                         \
  if (eflag || vflag) {                                                     \
    if (eflag!=2 && vflag!=2) {                                             \
      const int vwidth = simd_size();                                       \
      const int voffset = tid & (simd_size() - 1);                          \
      const int bnum = tid/simd_size();                                     \
      int active_subgs = BLOCK_SIZE_X/simd_size();                          \
      for ( ; active_subgs > 1; active_subgs /= vwidth) {                   \
        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads();       \
        if (bnum < active_subgs) {                                          \
          if (eflag) {                                                      \
            simd_reduce_add2(vwidth, energy, e_coul);                       \
            if (voffset==0) {                                               \
              red_acc[6][bnum] = energy;                                    \
              red_acc[7][bnum] = e_coul;                                    \
            }                                                               \
          }                                                                 \
          if (vflag) {                                                      \
            simd_reduce_arr(6, vwidth, virial);                             \
            if (voffset==0)                                                 \
              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r];           \
          }                                                                 \
        }                                                                   \
                                                                            \
        __syncthreads();                                                    \
        if (tid < active_subgs) {                                           \
          if (eflag) {                                                      \
            energy = red_acc[6][tid];                                       \
            e_coul = red_acc[7][tid];                                       \
          }                                                                 \
          if (vflag)                                                        \
            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid];        \
        } else {                                                            \
          if (eflag) energy = e_coul = (acctyp)0;                           \
          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0;     \
        }                                                                   \
      }                                                                     \
                                                                            \
      if (bnum == 0) {                                                      \
        int ei=BLOCK_ID_X;                                                  \
        if (eflag) {                                                        \
          simd_reduce_add2(vwidth, energy, e_coul);                         \
          if (tid==0) {                                                     \
            engv[ei]+=energy*(acctyp)0.5;                                   \
            ei+=ev_stride;                                                  \
            engv[ei]+=e_coul*(acctyp)0.5;                                   \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
        if (vflag) {                                                        \
          simd_reduce_arr(6, vwidth, virial);                               \
          if (tid==0) {                                                     \
            for (int r=0; r<6; r++) {                                       \
              engv[ei]+=virial[r]*(acctyp)0.5;                              \
              ei+=ev_stride;                                                \
            }                                                               \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (eflag) {                                                          \
        engv[ei]+=energy*(acctyp)0.5;                                       \
        ei+=inum;                                                           \
        engv[ei]+=e_coul*(acctyp)0.5;                                       \
        ei+=inum;                                                           \
      }                                                                     \
      if (vflag) {                                                          \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]+=virial[i]*(acctyp)0.5;                                  \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

// EVFLAG == 0
#else

#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
                        offset, eflag, vflag, ans, engv, ev_stride)         \
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
    acctyp4 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
    ans[ii]=old;                                                            \
  }

#endif // EVFLAG
#endif // SHUFFLE_AVAIL

#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MY_PIS (acctyp)1.77245385090551602729

/* ----------------------------------------------------------------------
   repulsion = Pauli repulsion interactions
   adapted from Tinker erepel1b() routine
------------------------------------------------------------------------- */

__kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict extra,
                                 const __global numtyp4 *restrict coeff,
                                 const __global numtyp4 *restrict sp_nonpolar,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
                                 __global acctyp4 *restrict ans,
                                 __global acctyp *restrict engv,
                                 __global acctyp4 *restrict tep,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
                                 const int t_per_atom, const numtyp aewald,
                                 const numtyp off2, const numtyp cut2,
                                 const numtyp c0, const numtyp c1, const numtyp c2,
                                 const numtyp c3, const numtyp c4, const numtyp c5)
{
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
  local_allocate_store_charge();

  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0;
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

  acctyp4 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;

  numtyp4* polar1 = (numtyp4*)(&extra[0]);
  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);

  if (ii<inum) {
    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    //numtyp qtmp; fetch(qtmp,i,q_tex);
    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor];
      nbor += n_stride;
      nbor_end = nbor+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    const numtyp4 pol1i = polar1[i];
    numtyp ci  = pol1i.x;    // rpole[i][0];
    numtyp dix = pol1i.y;    // rpole[i][1];
    numtyp diy = pol1i.z;    // rpole[i][2];
    numtyp diz = pol1i.w;    // rpole[i][3];
    const numtyp4 pol2i = polar2[i];
    numtyp qixx = pol2i.x;   // rpole[i][4];
    numtyp qixy = pol2i.y;   // rpole[i][5];
    numtyp qixz = pol2i.z;   // rpole[i][6];
    numtyp qiyy = pol2i.w;   // rpole[i][8];
    const numtyp4 pol3i = polar3[i];
    numtyp qiyz = pol3i.x;   // rpole[i][9];
    numtyp qizz = pol3i.y;   // rpole[i][12];
    int itype = pol3i.z; // amtype[i];
    numtyp sizi = coeff[itype].x; // sizpr[itype];
    numtyp dmpi = coeff[itype].y; // dmppr[itype];
    numtyp vali = coeff[itype].z; // elepr[itype];

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;

      // Compute r12
      numtyp xr = ix.x - jx.x;
      numtyp yr = ix.y - jx.y;
      numtyp zr = ix.z - jx.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      if (r2>off2) continue;

      const numtyp4 pol1j = polar1[j];
      numtyp ck  = pol1j.x;  // rpole[j][0];
      numtyp dkx = pol1j.y;  // rpole[j][1];
      numtyp dky = pol1j.z;  // rpole[j][2];
      numtyp dkz = pol1j.w;  // rpole[j][3];
      const numtyp4 pol2j = polar2[j];
      numtyp qkxx = pol2j.x; // rpole[j][4];
      numtyp qkxy = pol2j.y; // rpole[j][5];
      numtyp qkxz = pol2j.z; // rpole[j][6];
      numtyp qkyy = pol2j.w; // rpole[j][8];
      const numtyp4 pol3j = polar3[j];
      numtyp qkyz = pol3j.x; // rpole[j][9];
      numtyp qkzz = pol3j.y; // rpole[j][12];
      int jtype = pol3j.z; // amtype[j];

      numtyp sizk = coeff[jtype].x; // sizpr[jtype];
      numtyp dmpk = coeff[jtype].y; // dmppr[jtype];
      numtyp valk = coeff[jtype].z; // elepr[jtype];

      const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)];
      numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)];

      // intermediates involving moments and separation distance

      numtyp dir = dix*xr + diy*yr + diz*zr;
      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
      numtyp qir = qix*xr + qiy*yr + qiz*zr;
      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;

      numtyp dik = dix*dkx + diy*dky + diz*dkz;
      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
        qixx*qkxx + qiyy*qkyy + qizz*qkzz;

      // additional intermediates involving moments and distance

      numtyp dirx = diy*zr - diz*yr;
      numtyp diry = diz*xr - dix*zr;
      numtyp dirz = dix*yr - diy*xr;
      numtyp dkrx = dky*zr - dkz*yr;
      numtyp dkry = dkz*xr - dkx*zr;
      numtyp dkrz = dkx*yr - dky*xr;
      numtyp dikx = diy*dkz - diz*dky;
      numtyp diky = diz*dkx - dix*dkz;
      numtyp dikz = dix*dky - diy*dkx;
      numtyp qirx = qiz*yr - qiy*zr;
      numtyp qiry = qix*zr - qiz*xr;
      numtyp qirz = qiy*xr - qix*yr;
      numtyp qkrx = qkz*yr - qky*zr;
      numtyp qkry = qkx*zr - qkz*xr;
      numtyp qkrz = qky*xr - qkx*yr;
      numtyp qikx = qky*qiz - qkz*qiy;
      numtyp qiky = qkz*qix - qkx*qiz;
      numtyp qikz = qkx*qiy - qky*qix;
      numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz;
      numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz;
      numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz;
      numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz;
      numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz;
      numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz;
      numtyp qikrx = qizk*yr - qiyk*zr;
      numtyp qikry = qixk*zr - qizk*xr;
      numtyp qikrz = qiyk*xr - qixk*yr;
      numtyp qkirx = qkzi*yr - qkyi*zr;
      numtyp qkiry = qkxi*zr - qkzi*xr;
      numtyp qkirz = qkyi*xr - qkxi*yr;
      numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
      numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
      numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
      numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
      numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
      numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
      numtyp diqkrx = diqkz*yr - diqky*zr;
      numtyp diqkry = diqkx*zr - diqkz*xr;
      numtyp diqkrz = diqky*xr - diqkx*yr;
      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
      numtyp dkqiry = dkqix*zr - dkqiz*xr;
      numtyp dkqirz = dkqiy*xr - dkqix*yr;
      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy -
        (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz -
        (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix -
        (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);

      // get reciprocal distance terms for this interaction

      numtyp r = ucl_sqrt(r2);
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = rinv;
      numtyp rr3 = rr1 * r2inv;
      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
      numtyp rr11 = (numtyp)9.0 * rr9 * r2inv;

      // get damping coefficients for the Pauli repulsion energy
      numtyp dmpik[11];
      damprep(r,r2,rr1,rr3,rr5,rr7,rr9,rr11,11,dmpi,dmpk,dmpik);

      // calculate intermediate terms needed for the energy

      numtyp term1 = vali*valk;
      numtyp term2 = valk*dir - vali*dkr + dik;
      numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
      numtyp term4 = dir*qkr - dkr*qir - 4.0*qik;
      numtyp term5 = qir*qkr;
      numtyp eterm = term1*dmpik[0] + term2*dmpik[2] +
        term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8];

      // compute the Pauli repulsion energy for this interaction

      numtyp sizik = sizi * sizk * factor_repel;
      numtyp e = sizik * eterm * rr1;

      // calculate intermediate terms for force and torque

      numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] +
        term4*dmpik[8] + term5*dmpik[10];
      term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6];
      term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6];
      term3 = (numtyp)2.0 * dmpik[4];
      term4 = (numtyp)2.0 * (-valk*dmpik[4] + dkr*dmpik[6] - qkr*dmpik[8]);
      term5 = (numtyp)2.0 * (-vali*dmpik[4] - dir*dmpik[6] - qir*dmpik[8]);
      numtyp term6 = (numtyp)4.0 * dmpik[6];

      // compute the force components for this interaction

      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) +
        term4*qix + term5*qkx + term6*(qixk+qkxi);
      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) +
        term4*qiy + term5*qky + term6*(qiyk+qkyi);
      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) +
        term4*qiz + term5*qkz + term6*(qizk+qkzi);
      frcx = frcx*rr1 + eterm*rr3*xr;
      frcy = frcy*rr1 + eterm*rr3*yr;
      frcz = frcz*rr1 + eterm*rr3*zr;

      // compute the torque components for this interaction

      numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) -
        term4*qirx - term6*(qikrx+qikx);
      numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) -
        term4*qiry - term6*(qikry+qiky);
      numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) -
        term4*qirz - term6*(qikrz+qikz);
      ttmix = sizik * ttmix * rr1;
      ttmiy = sizik * ttmiy * rr1;
      ttmiz = sizik * ttmiz * rr1;

      // use energy switching if near the cutoff distance

      if (r2 > cut2) {
        numtyp r3 = r2 * r;
        numtyp r4 = r2 * r2;
        numtyp r5 = r2 * r3;
        numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0;
        numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp).0*c4*r3 +
          (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1;
        dtaper *= e * rr1;
        e *= taper;
        frcx = frcx*taper - dtaper*xr;
        frcy = frcy*taper - dtaper*yr;
        frcz = frcz*taper - dtaper*zr;
        ttmix *= taper;
        ttmiy *= taper;
        ttmiz *= taper;
      }

      energy += e;

      // increment force-based gradient and torque on atom I

      f.x += frcx;
      f.y += frcy;
      f.z += frcz;
      tq.x += ttmix;
      tq.y += ttmiy;
      tq.z += ttmiz;

      // increment the internal virial tensor components
      if (EVFLAG && vflag) {
        numtyp vxx = -xr * frcx;
        numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy);
        numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz);
        numtyp vyy = -yr * frcy;
        numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz);
        numtyp vzz = -zr * frcz;

        virial[0] += vxx;
        virial[1] += vyy;
        virial[2] += vzz;
        virial[3] += vxy;
        virial[4] += vxz;
        virial[5] += vyz;
      }
    } // nbor

  } // ii<inum

  // accumulate tq
  store_answers_hippo_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
  // accumate force, energy and virial
  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
     offset,eflag,vflag,ans,engv);
}

/* ----------------------------------------------------------------------
   dispersion = real-space portion of Ewald dispersion
   adapted from Tinker edreal1d() routine
------------------------------------------------------------------------- */

__kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict extra,
                                 const __global numtyp4 *restrict coeff_amtype,
                                 const __global numtyp4 *restrict coeff_amclass,
                                 const __global numtyp4 *restrict sp_nonpolar,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
                                 __global acctyp4 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
                                 const int t_per_atom, const numtyp aewald,
                                 const numtyp off2)
{
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
  local_allocate_store_charge();

  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0;
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);

  if (ii<inum) {
    int itype,iclass;
    numtyp ci,ai;

    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    //numtyp qtmp; fetch(qtmp,i,q_tex);
    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor];
      nbor += n_stride;
      nbor_end = nbor+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    itype  = polar3[i].z;            // amtype[i];
    iclass = coeff_amtype[itype].w;  // amtype2class[itype];
    ci = coeff_amclass[iclass].x;    // csix[iclass];
    ai = coeff_amclass[iclass].y;    // adisp[iclass];

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;

      // Compute r12
      numtyp xr = ix.x - jx.x;
      numtyp yr = ix.y - jx.y;
      numtyp zr = ix.z - jx.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;

      int jtype =   polar3[j].z; // amtype[j];
      int jclass = coeff_amtype[jtype].w;  // amtype2class[jtype];
      numtyp ck = coeff_amclass[jclass].x;    // csix[jclass];
      numtyp ak = coeff_amclass[jclass].y;    // adisp[jclass];

      numtyp r6 = r2*r2*r2;
      numtyp ralpha2 = r2 * aewald*aewald;
      numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2;
      numtyp expterm = ucl_exp(-ralpha2);
      numtyp expa = expterm * term;

      // find the damping factor for the dispersion interaction

      numtyp r = ucl_sqrt(r2);
      numtyp r7 = r6 * r;
      numtyp di = ai * r;
      numtyp di2 = di * di;
      numtyp di3 = di * di2;
      numtyp dk = ak * r;
      numtyp expi = ucl_exp(-di);
      numtyp expk = ucl_exp(-dk);

      numtyp ai2,ak2;
      numtyp di4,di5;
      numtyp dk2,dk3;
      numtyp ti,ti2;
      numtyp tk,tk2;
      numtyp damp3,damp5;
      numtyp ddamp;
      const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)];
      numtyp factor_disp = sp_nonpol.z; // factor_disp = special_disp[sbmask15(j)];

      if (ai != ak) {
        ai2 = ai * ai;
        ak2 = ak * ak;
        dk2 = dk * dk;
        dk3 = dk * dk2;
        ti = ak2 / (ak2-ai2);
        ti2 = ti * ti;
        tk = ai2 / (ai2-ak2);
        tk2 = tk * tk;
        damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi
          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk
          - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi
          - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk;
        damp5 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi
          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk
          - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi
          - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk;
        ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) +
          (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0);

      } else {
        di4 = di2 * di2;
        di5 = di2 * di3;
        damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi;
        damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi;
        ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0;
      }

      numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3;

      // apply damping and scaling factors for this interaction

      numtyp scale = factor_disp * damp*damp;
      scale = scale - (numtyp)1.0;
      numtyp e = -ci * ck * (expa+scale) / r6;
      numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r;
      numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7;

      energy+= e;

      // increment the damped dispersion derivative components

      numtyp dedx = de * xr;
      numtyp dedy = de * yr;
      numtyp dedz = de * zr;
      f.x += dedx;
      f.y += dedy;
      f.z += dedz;

      // increment the internal virial tensor components

      numtyp vxx = xr * dedx;
      numtyp vyx = yr * dedx;
      numtyp vzx = zr * dedx;
      numtyp vyy = yr * dedy;
      numtyp vzy = zr * dedy;
      numtyp vzz = zr * dedz;

      virial[0] += vxx;
      virial[1] += vyy;
      virial[2] += vzz;
      virial[3] += vyx;
      virial[4] += vzx;
      virial[5] += vzy;
    } // nbor

  } // ii<inum

  // accumate force, energy and virial
  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
     offset,eflag,vflag,ans,engv);
}

/* ----------------------------------------------------------------------
   multipole_real = real-space portion of multipole
   adapted from Tinker emreal1d() routine
------------------------------------------------------------------------- */

__kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict extra,
                                const __global numtyp4 *restrict coeff_amtype,
                                const __global numtyp4 *restrict coeff_amclass,
                                const __global numtyp4 *restrict sp_polar,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
                                __global acctyp4 *restrict ans,
                                __global acctyp *restrict engv,
                                __global acctyp4 *restrict tep,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                const int t_per_atom, const numtyp aewald,
                                const numtyp felec, const numtyp off2,
                                const numtyp polar_dscale, const numtyp polar_uscale)
{
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
  local_allocate_store_charge();

  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0;
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

  acctyp4 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;

  numtyp4* polar1 = (numtyp4*)(&extra[0]);
  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
  numtyp4* polar6 = (numtyp4*)(&extra[20*nall]);

  if (ii<inum) {
    int m;
    int itype,iclass;
    numtyp bfac;
    numtyp term1,term2,term3;
    numtyp term4,term5,term6;
    numtyp bn[6];

    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    //numtyp qtmp; fetch(qtmp,i,q_tex);
    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor];
      nbor += n_stride;
      nbor_end = nbor+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    const numtyp4 pol1i = polar1[i];
    numtyp ci  = pol1i.x;    // rpole[i][0];
    numtyp dix = pol1i.y;    // rpole[i][1];
    numtyp diy = pol1i.z;    // rpole[i][2];
    numtyp diz = pol1i.w;    // rpole[i][3];
    const numtyp4 pol2i = polar2[i];
    numtyp qixx = pol2i.x;   // rpole[i][4];
    numtyp qixy = pol2i.y;   // rpole[i][5];
    numtyp qixz = pol2i.z;   // rpole[i][6];
    numtyp qiyy = pol2i.w;   // rpole[i][8];
    const numtyp4 pol3i = polar3[i];
    numtyp qiyz = pol3i.x;   // rpole[i][9];
    numtyp qizz = pol3i.y;   // rpole[i][12];
    itype  = pol3i.z;        // amtype[i];
    iclass = coeff_amtype[itype].w;  // amtype2class[itype];

    numtyp corei = coeff_amclass[itype].z;  // pcore[iclass];
    numtyp alphai = coeff_amclass[itype].w; // palpha[iclass];
    numtyp vali = polar6[i].x;

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;

      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
      numtyp zr = jx.z - ix.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;

      numtyp r = ucl_sqrt(r2);
      const numtyp4 pol1j = polar1[j];
      numtyp ck  = pol1j.x;  // rpole[j][0];
      numtyp dkx = pol1j.y;  // rpole[j][1];
      numtyp dky = pol1j.z;  // rpole[j][2];
      numtyp dkz = pol1j.w;  // rpole[j][3];
      const numtyp4 pol2j = polar2[j];
      numtyp qkxx = pol2j.x; // rpole[j][4];
      numtyp qkxy = pol2j.y; // rpole[j][5];
      numtyp qkxz = pol2j.z; // rpole[j][6];
      numtyp qkyy = pol2j.w; // rpole[j][8];
      const numtyp4 pol3j = polar3[j];
      numtyp qkyz = pol3j.x; // rpole[j][9];
      numtyp qkzz = pol3j.y; // rpole[j][12];
      int jtype = pol3j.z; // amtype[j];
      int jgroup =  pol3j.w; // amgroup[j];
      int jclass = coeff_amtype[jtype].w;  // amtype2class[jtype];

      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
      numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];

      numtyp corek = coeff_amclass[jtype].z;  // pcore[jclass];
      numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass];
      numtyp valk = polar6[j].x;

      // intermediates involving moments and separation distance

      numtyp dir = dix*xr + diy*yr + diz*zr;
      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
      numtyp qir = qix*xr + qiy*yr + qiz*zr;
      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;

      numtyp dik = dix*dkx + diy*dky + diz*dkz;
      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
        qixx*qkxx + qiyy*qkyy + qizz*qkzz;

      // additional intermediates involving moments and distance

      numtyp dirx = diy*zr - diz*yr;
      numtyp diry = diz*xr - dix*zr;
      numtyp dirz = dix*yr - diy*xr;
      numtyp dkrx = dky*zr - dkz*yr;
      numtyp dkry = dkz*xr - dkx*zr;
      numtyp dkrz = dkx*yr - dky*xr;
      numtyp dikx = diy*dkz - diz*dky;
      numtyp diky = diz*dkx - dix*dkz;
      numtyp dikz = dix*dky - diy*dkx;
      numtyp qirx = qiz*yr - qiy*zr;
      numtyp qiry = qix*zr - qiz*xr;
      numtyp qirz = qiy*xr - qix*yr;
      numtyp qkrx = qkz*yr - qky*zr;
      numtyp qkry = qkx*zr - qkz*xr;
      numtyp qkrz = qky*xr - qkx*yr;
      numtyp qikx = qky*qiz - qkz*qiy;
      numtyp qiky = qkz*qix - qkx*qiz;
      numtyp qikz = qkx*qiy - qky*qix;
      numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz;
      numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz;
      numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz;
      numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz;
      numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz;
      numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz;
      numtyp qikrx = qizk*yr - qiyk*zr;
      numtyp qikry = qixk*zr - qizk*xr;
      numtyp qikrz = qiyk*xr - qixk*yr;
      numtyp qkirx = qkzi*yr - qkyi*zr;
      numtyp qkiry = qkxi*zr - qkzi*xr;
      numtyp qkirz = qkyi*xr - qkxi*yr;
      numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
      numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
      numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
      numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
      numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
      numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
      numtyp diqkrx = diqkz*yr - diqky*zr;
      numtyp diqkry = diqkx*zr - diqkz*xr;
      numtyp diqkrz = diqky*xr - diqkx*yr;
      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
      numtyp dkqiry = dkqix*zr - dkqiz*xr;
      numtyp dkqirz = dkqiy*xr - dkqix*yr;
      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy -
        (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz -
        (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix -
        (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);

      // get reciprocal distance terms for this interaction

      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = felec * rinv;
      numtyp rr3 = rr1 * r2inv;
      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
      numtyp rr11 = (numtyp)9.0 * rr9 * r2inv;

      // calculate the real space Ewald error function terms

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);

      for (m = 1; m < 6; m++) {
        bfac = (numtyp) (m+m-1);
        alsq2n = alsq2 * alsq2n;
        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
      }
      for (m = 0; m < 6; m++) bn[m] *= felec;

      term1 = corei*corek;
      numtyp term1i = corek*vali;
      numtyp term2i = corek*dir;
      numtyp term3i = corek*qir;
      numtyp term1k = corei*valk;
      numtyp term2k = -corei*dkr;
      numtyp term3k = corei*qkr;
      numtyp term1ik = vali*valk;
      numtyp term2ik = valk*dir - vali*dkr + dik;
      numtyp term3ik = vali*qkr + valk*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk);
      numtyp term4ik = dir*qkr - dkr*qir - 4.0*qik;
      numtyp term5ik = qir*qkr;
      numtyp dmpi[9],dmpj[9];
      numtyp dmpij[11];
      damppole(r,11,alphai,alphak,dmpi,dmpj,dmpij);
      numtyp scalek = factor_mpole;
      numtyp rr1i = bn[0] - ((numtyp)1.0-scalek*dmpi[0])*rr1;
      numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3;
      numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5;
      numtyp rr7i = bn[3] - ((numtyp)1.0-scalek*dmpi[6])*rr7;
      numtyp rr1k = bn[0] - ((numtyp)1.0-scalek*dmpj[0])*rr1;
      numtyp rr3k = bn[1] - ((numtyp)1.0-scalek*dmpj[2])*rr3;
      numtyp rr5k = bn[2] - ((numtyp)1.0-scalek*dmpj[4])*rr5;
      numtyp rr7k = bn[3] - ((numtyp)1.0-scalek*dmpj[6])*rr7;
      numtyp rr1ik = bn[0] - ((numtyp)1.0-scalek*dmpij[0])*rr1;
      numtyp rr3ik = bn[1] - ((numtyp)1.0-scalek*dmpij[2])*rr3;
      numtyp rr5ik = bn[2] - ((numtyp)1.0-scalek*dmpij[4])*rr5;
      numtyp rr7ik = bn[3] - ((numtyp)1.0-scalek*dmpij[6])*rr7;
      numtyp rr9ik = bn[4] - ((numtyp)1.0-scalek*dmpij[8])*rr9;
      numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11;
      rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1;
      rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3;
      numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik +
        term1i*rr1i + term1k*rr1k + term1ik*rr1ik +
        term2i*rr3i + term2k*rr3k + term2ik*rr3ik +
        term3i*rr5i + term3k*rr5k + term3ik*rr5ik;

      // find damped multipole intermediates for force and torque

      numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik +
        term1i*rr3i + term1k*rr3k + term1ik*rr3ik +
        term2i*rr5i + term2k*rr5k + term2ik*rr5ik +
        term3i*rr7i + term3k*rr7k + term3ik*rr7ik;
      term1 = -corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik;
      term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik;
      term3 = (numtyp)2.0 * rr5ik;
      term4 = (numtyp)-2.0 * (corek*rr5i+valk*rr5ik - dkr*rr7ik+qkr*rr9ik);
      term5 = (numtyp)-2.0 * (corei*rr5k+vali*rr5ik + dir*rr7ik+qir*rr9ik);
      term6 = (numtyp)4.0 * rr7ik;
      rr3 = rr3ik;

      energy += e;

      // compute the force components for this interaction

      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) +
        term4*qix + term5*qkx + term6*(qixk+qkxi);
      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) +
        term4*qiy + term5*qky + term6*(qiyk+qkyi);
      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) +
        term4*qiz + term5*qkz + term6*(qizk+qkzi);

      // compute the torque components for this interaction

      numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) -
        term4*qirx - term6*(qikrx+qikx);
      numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) -
        term4*qiry - term6*(qikry+qiky);
      numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) -
        term4*qirz - term6*(qikrz+qikz);

      // increment force-based gradient and torque on first site

      f.x += frcx;
      f.y += frcy;
      f.z += frcz;
      tq.x += ttmix;
      tq.y += ttmiy;
      tq.z += ttmiz;

      if (EVFLAG && vflag) {
        numtyp vxx = -xr * frcx;
        numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy);
        numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz);
        numtyp vyy = -yr * frcy;
        numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz);
        numtyp vzz = -zr * frcz;

        virial[0] += vxx;
        virial[1] += vyy;
        virial[2] += vzz;
        virial[3] += vxy;
        virial[4] += vxz;
        virial[5] += vyz;
      }
    } // nbor

  } // ii<inum

  // accumulate tq
  store_answers_hippo_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);

  // accumate force, energy and virial: use _acc if not the first kernel
  //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
     //offset,eflag,vflag,ans,engv);
  store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
     offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
}

/* ----------------------------------------------------------------------
  udirect2b = Ewald real direct field via list
  udirect2b computes the real space contribution of the permanent
   atomic multipole moments to the field via a neighbor list
------------------------------------------------------------------------- */

__kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict extra,
                                 const __global numtyp4 *restrict coeff,
                                 const __global numtyp4 *restrict sp_polar,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
                                 __global acctyp4 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
                                 const numtyp polar_dscale, const numtyp polar_uscale)
{
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
  local_allocate_store_charge();

  acctyp _fieldp[6];
  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;

  numtyp4* polar1 = (numtyp4*)(&extra[0]);
  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);

  if (ii<inum) {
    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    //numtyp qtmp; fetch(qtmp,i,q_tex);
    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor];
      nbor += n_stride;
      nbor_end = nbor+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    numtyp bn[4],bcn[3];
    numtyp fid[3],fip[3];

    const numtyp4 pol1i = polar1[i];
    numtyp dix = pol1i.y;    // rpole[i][1];
    numtyp diy = pol1i.z;    // rpole[i][2];
    numtyp diz = pol1i.w;    // rpole[i][3];
    const numtyp4 pol2i = polar2[i];
    numtyp qixx = pol2i.x;   // rpole[i][4];
    numtyp qixy = pol2i.y;   // rpole[i][5];
    numtyp qixz = pol2i.z;   // rpole[i][6];
    numtyp qiyy = pol2i.w;   // rpole[i][8];
    const numtyp4 pol3i = polar3[i];
    numtyp qiyz = pol3i.x;   // rpole[i][9];
    numtyp qizz = pol3i.y;   // rpole[i][12];
    int itype  = pol3i.z;    // amtype[i];
    int igroup = pol3i.w;    // amgroup[i];

    // debug:
    // xi__ = ix; xi__.w = itype;

    numtyp pdi = coeff[itype].x;
    numtyp pti = coeff[itype].y;
    numtyp ddi = coeff[itype].z;

    numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
    numtyp aesq2n = (numtyp)0.0;
    if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald);

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;

      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
      numtyp zr = jx.z - ix.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;

      numtyp r = ucl_sqrt(r2);
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = rinv;
      numtyp rr3 = rr1 * r2inv;
      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;

      const numtyp4 pol1j = polar1[j];
      numtyp ck  = pol1j.x;  // rpole[j][0];
      numtyp dkx = pol1j.y;  // rpole[j][1];
      numtyp dky = pol1j.z;  // rpole[j][2];
      numtyp dkz = pol1j.w;  // rpole[j][3];
      const numtyp4 pol2j = polar2[j];
      numtyp qkxx = pol2j.x; // rpole[j][4];
      numtyp qkxy = pol2j.y; // rpole[j][5];
      numtyp qkxz = pol2j.z; // rpole[j][6];
      numtyp qkyy = pol2j.w; // rpole[j][8];
      const numtyp4 pol3j = polar3[j];
      numtyp qkyz = pol3j.x; // rpole[j][9];
      numtyp qkzz = pol3j.y; // rpole[j][12];
      int jtype = pol3j.z; // amtype[j];
      int jgroup =  pol3j.w; // amgroup[j];

      numtyp factor_dscale, factor_pscale;
      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
      if (igroup == jgroup) {
        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
        factor_dscale = polar_dscale;
      } else {
        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
        factor_dscale = (numtyp)1.0;
      }

      // intermediates involving moments and separation distance

      numtyp dir = dix*xr + diy*yr + diz*zr;
      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
      numtyp qir = qix*xr + qiy*yr + qiz*zr;
      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;

      // calculate the real space Ewald error function terms

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
        numtyp bfac = (numtyp) (m+m-1);
        aefac = aesq2 * aefac;
        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv;
      }

      // find the field components for Thole polarization damping

      numtyp scale3 = (numtyp)1.0;
      numtyp scale5 = (numtyp)1.0;
      numtyp scale7 = (numtyp)1.0;
      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
      if (damp != (numtyp)0.0) {
        numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype]
        if (pgamma != (numtyp)0.0) {
          damp = pgamma * ucl_powr(r/damp,(numtyp)1.5);
          if (damp < (numtyp)50.0) {
            numtyp expdamp = ucl_exp(-damp) ;
            scale3 = (numtyp)1.0 - expdamp ;
            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp);
            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp);
          }
        } else {
          pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
          damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
          if (damp < (numtyp)50.0) {
            numtyp expdamp = ucl_exp(-damp);
            scale3 = (numtyp)1.0 - expdamp;
            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp);
          }
        }
      } else { // damp == 0: ???
      }

      numtyp scalek = factor_dscale;
      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
      fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
      fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
      fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;

      scalek = factor_pscale;
      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
      fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
      fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
      fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;

      _fieldp[0] += fid[0];
      _fieldp[1] += fid[1];
      _fieldp[2] += fid[2];
      _fieldp[3] += fip[0];
      _fieldp[4] += fip[1];
      _fieldp[5] += fip[2];
    }  // nbor

  } // ii<inum

  // accumulate field and fieldp

  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
}

/* ----------------------------------------------------------------------
  umutual2b = Ewald real mutual field via list
   umutual2b computes the real space contribution of the induced
   atomic dipole moments to the field via a neighbor list
------------------------------------------------------------------------- */

__kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict extra,
                                 const __global numtyp4 *restrict coeff,
                                 const __global numtyp4 *restrict sp_polar,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
                                 __global acctyp4 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
                                 const numtyp polar_dscale, const numtyp polar_uscale)
{
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
  local_allocate_store_charge();

  acctyp _fieldp[6];
  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;

  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
  numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
  numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);

  //numtyp4 xi__;

  if (ii<inum) {
    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    //numtyp qtmp; fetch(qtmp,i,q_tex);
    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor];
      nbor += n_stride;
      nbor_end = nbor+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    int itype,igroup;
    numtyp bn[4],bcn[3];
    numtyp fid[3],fip[3];

    itype  = polar3[i].z; // amtype[i];
    igroup = polar3[i].w; // amgroup[i];

    numtyp pdi = coeff[itype].x;
    numtyp pti = coeff[itype].y;

    numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
    numtyp aesq2n = (numtyp)0.0;
    if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald);

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;

      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
      numtyp zr = jx.z - ix.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;

      numtyp r = ucl_sqrt(r2);
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = rinv;
      numtyp rr3 = rr1 * r2inv;
      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;

      const numtyp4 pol3j = polar3[j];
      int jtype = pol3j.z; // amtype[j];
      int jgroup =  pol3j.w; // amgroup[j];
      const numtyp4 pol4j = polar4[j];
      numtyp ukx = pol4j.x;  // uind[j][0];
      numtyp uky = pol4j.y;  // uind[j][1];
      numtyp ukz = pol4j.z;  // uind[j][2];
      const numtyp4 pol5j = polar5[j];
      numtyp ukxp = pol5j.x; // uinp[j][0];
      numtyp ukyp = pol5j.y; // uinp[j][1];
      numtyp ukzp = pol5j.z; // uinp[j][2];

      numtyp factor_uscale;
      if (igroup == jgroup) factor_uscale = polar_uscale;
      else factor_uscale = (numtyp)1.0;

      // calculate the real space Ewald error function terms

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
        numtyp bfac = (numtyp) (m+m-1);
        aefac = aesq2 * aefac;
        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv;
      }

      // find terms needed later to compute mutual polarization
      // if (poltyp != DIRECT)
      numtyp scale3 = (numtyp)1.0;
      numtyp scale5 = (numtyp)1.0;
      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
      if (damp != (numtyp)0.0) {
        numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
        damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
        if (damp < (numtyp)50.0) {
          numtyp expdamp = ucl_exp(-damp);
          scale3 = (numtyp)1.0 - expdamp;
          scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
        }

      } else { // damp == 0: ???
      }

      numtyp scalek = factor_uscale;
      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;

      numtyp tdipdip[6]; // the following tdipdip is incorrect!! needs work to store tdipdip
      tdipdip[0] = -bcn[0] + bcn[1]*xr*xr;
      tdipdip[1] = bcn[1]*xr*yr;
      tdipdip[2] = bcn[1]*xr*zr;
      tdipdip[3] = -bcn[0] + bcn[1]*yr*yr;
      tdipdip[4] = bcn[1]*yr*zr;
      tdipdip[5] = -bcn[0] + bcn[1]*zr*zr;
      //if (i==0 && j == 10)
      //  printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n",
      //    i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]);
      fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz;
      fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz;
      fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz;

      fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp;
      fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp;
      fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp;

      _fieldp[0] += fid[0];
      _fieldp[1] += fid[1];
      _fieldp[2] += fid[2];
      _fieldp[3] += fip[0];
      _fieldp[4] += fip[1];
      _fieldp[5] += fip[2];
    }  // nbor

  } // ii<inum

  // accumulate field and fieldp

  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
}

/* ----------------------------------------------------------------------
   polar_real = real-space portion of induced dipole polarization
   adapted from Tinker epreal1d() routine
------------------------------------------------------------------------- */

__kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict extra,
                             const __global numtyp4 *restrict coeff,
                             const __global numtyp4 *restrict sp_polar,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             const __global int *dev_short_nbor,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             __global acctyp4 *restrict tep,
                             const int eflag, const int vflag, const int inum,
                             const int nall, const int nbor_pitch, const int t_per_atom,
                             const numtyp aewald, const numtyp felec,
                             const numtyp off2, const numtyp polar_dscale,
                             const numtyp polar_uscale)
{
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
  local_allocate_store_ufld();
  local_allocate_store_charge();

  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0;
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

  acctyp ufld[3];
  ufld[0] = (acctyp)0; ufld[1]=(acctyp)0; ufld[2]=(acctyp)0;
  acctyp dufld[6];
  for (int l=0; l<6; l++) dufld[l]=(acctyp)0;

  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
  numtyp4* polar1 = (numtyp4*)(&extra[0]);
  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
  numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
  numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);

  //numtyp4 xi__;

  if (ii<inum) {
    int k,m,itype,igroup;
    numtyp bfac;
    numtyp psc3,psc5,psc7;
    numtyp dsc3,dsc5,dsc7;
    numtyp usc3,usc5;
    numtyp psr3,psr5,psr7;
    numtyp dsr3,dsr5,dsr7;
    numtyp usr5;
    numtyp term1,term2,term3;
    numtyp term4,term5;
    numtyp term6,term7;
    numtyp rc3[3],rc5[3],rc7[3];
    numtyp prc3[3],prc5[3],prc7[3];
    numtyp drc3[3],drc5[3],drc7[3];
    numtyp urc3[3],urc5[3];
    numtyp bn[5];
    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;

    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    //numtyp qtmp; fetch(qtmp,i,q_tex);
    //int itype=ix.w;

    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor];
      nbor += n_stride;
      nbor_end = nbor+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }

    const numtyp4 pol1i = polar1[i];
    ci  = pol1i.x;    // rpole[i][0];
    dix = pol1i.y;    // rpole[i][1];
    diy = pol1i.z;    // rpole[i][2];
    diz = pol1i.w;    // rpole[i][3];
    const numtyp4 pol2i = polar2[i];
    qixx = pol2i.x;   // rpole[i][4];
    qixy = pol2i.y;   // rpole[i][5];
    qixz = pol2i.z;   // rpole[i][6];
    qiyy = pol2i.w;   // rpole[i][8];
    const numtyp4 pol3i = polar3[i];
    qiyz = pol3i.x;   // rpole[i][9];
    qizz = pol3i.y;   // rpole[i][12];
    itype  = pol3i.z;    // amtype[i];
    igroup = pol3i.w;    // amgroup[i];
    const numtyp4 pol4i = polar4[i];
    uix = pol4i.x;    // uind[i][0];
    uiy = pol4i.y;    // uind[i][1];
    uiz = pol4i.z;    // uind[i][2];
    const numtyp4 pol5i = polar5[i];
    uixp = pol5i.x;   // uinp[i][0];
    uiyp = pol5i.y;   // uinp[i][1];
    uizp = pol5i.z;   // uinp[i][2];

    // debug:
    // xi__ = ix; xi__.w = itype;

    numtyp pdi = coeff[itype].x;
    numtyp pti = coeff[itype].y;

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;

      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
      numtyp zr = jx.z - ix.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;

      numtyp r = ucl_sqrt(r2);

      const numtyp4 pol1j = polar1[j];
      numtyp ck = polar1[j].x;   // rpole[j][0];
      numtyp dkx = polar1[j].y;  // rpole[j][1];
      numtyp dky = polar1[j].z;  // rpole[j][2];
      numtyp dkz = polar1[j].w;  // rpole[j][3];
      const numtyp4 pol2j = polar2[j];
      numtyp qkxx = pol2j.x; // rpole[j][4];
      numtyp qkxy = pol2j.y; // rpole[j][5];
      numtyp qkxz = pol2j.z; // rpole[j][6];
      numtyp qkyy = pol2j.w; // rpole[j][8];
      const numtyp4 pol3j = polar3[j];
      numtyp qkyz = pol3j.x; // rpole[j][9];
      numtyp qkzz = pol3j.y; // rpole[j][12];
      int jtype =   pol3j.z; // amtype[j];
      int jgroup =  pol3j.w; // amgroup[j];
      const numtyp4 pol4j = polar4[j];
      numtyp ukx = pol4j.x;  // uind[j][0];
      numtyp uky = pol4j.y;  // uind[j][1];
      numtyp ukz = pol4j.z;  // uind[j][2];
      const numtyp4 pol5j = polar5[j];
      numtyp ukxp = pol5j.x; // uinp[j][0];
      numtyp ukyp = pol5j.y; // uinp[j][1];
      numtyp ukzp = pol5j.z; // uinp[j][2];

      numtyp factor_dscale, factor_pscale, factor_uscale;
      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
      if (igroup == jgroup) {
        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
        factor_dscale = polar_dscale;
        factor_uscale = polar_uscale;
      } else {
        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
        factor_dscale = factor_uscale = (numtyp)1.0;
      }

      // intermediates involving moments and separation distance

      numtyp dir = dix*xr + diy*yr + diz*zr;
      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
      numtyp qir = qix*xr + qiy*yr + qiz*zr;
      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
      numtyp uir = uix*xr + uiy*yr + uiz*zr;
      numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
      numtyp ukr = ukx*xr + uky*yr + ukz*zr;
      numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;

      // get reciprocal distance terms for this interaction

      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = felec * rinv;
      numtyp rr3 = rr1 * r2inv;
      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;

      // calculate the real space Ewald error function terms

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);

      for (m = 1; m <= 4; m++) {
        bfac = (numtyp) (m+m-1);
        alsq2n = alsq2 * alsq2n;
        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
      }
      for (m = 0; m < 5; m++) bn[m] *= felec;

      // apply Thole polarization damping to scale factors

      numtyp sc3 = (numtyp)1.0;
      numtyp sc5 = (numtyp)1.0;
      numtyp sc7 = (numtyp)1.0;
      for (k = 0; k < 3; k++) {
        rc3[k] = (numtyp)0.0;
        rc5[k] = (numtyp)0.0;
        rc7[k] = (numtyp)0.0;
      }

      // apply Thole polarization damping to scale factors

      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
      if (damp != (numtyp)0.0) {
        numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
        damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
        if (damp < (numtyp)50.0) {
          numtyp expdamp = ucl_exp(-damp);
          sc3 = (numtyp)1.0 - expdamp;
          sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp;
          sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp;
          numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv;
          numtyp temp5 = damp;
          numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp;
          rc3[0] = xr * temp3;
          rc3[1] = yr * temp3;
          rc3[2] = zr * temp3;
          rc5[0] = rc3[0] * temp5;
          rc5[1] = rc3[1] * temp5;
          rc5[2] = rc3[2] * temp5;
          rc7[0] = rc5[0] * temp7;
          rc7[1] = rc5[1] * temp7;
          rc7[2] = rc5[2] * temp7;
        }

        psc3 = (numtyp)1.0 - sc3*factor_pscale;
        psc5 = (numtyp)1.0 - sc5*factor_pscale;
        psc7 = (numtyp)1.0 - sc7*factor_pscale;
        dsc3 = (numtyp)1.0 - sc3*factor_dscale;
        dsc5 = (numtyp)1.0 - sc5*factor_dscale;
        dsc7 = (numtyp)1.0 - sc7*factor_dscale;
        usc3 = (numtyp)1.0 - sc3*factor_uscale;
        usc5 = (numtyp)1.0 - sc5*factor_uscale;
        psr3 = bn[1] - psc3*rr3;
        psr5 = bn[2] - psc5*rr5;
        psr7 = bn[3] - psc7*rr7;
        dsr3 = bn[1] - dsc3*rr3;
        dsr5 = bn[2] - dsc5*rr5;
        dsr7 = bn[3] - dsc7*rr7;
        usr5 = bn[2] - usc5*rr5;
        for (k = 0; k < 3; k++) {
          prc3[k] = rc3[k] * factor_pscale;
          prc5[k] = rc5[k] * factor_pscale;
          prc7[k] = rc7[k] * factor_pscale;
          drc3[k] = rc3[k] * factor_dscale;
          drc5[k] = rc5[k] * factor_dscale;
          drc7[k] = rc7[k] * factor_dscale;
          urc3[k] = rc3[k] * factor_uscale;
          urc5[k] = rc5[k] * factor_uscale;
        }
      } else { // damp == 0: ???
      }

      // get the induced dipole field used for dipole torques

      numtyp tix3 = psr3*ukx + dsr3*ukxp;
      numtyp tiy3 = psr3*uky + dsr3*ukyp;
      numtyp tiz3 = psr3*ukz + dsr3*ukzp;
      numtyp tuir = -psr5*ukr - dsr5*ukrp;

      ufld[0] += tix3 + xr*tuir;
      ufld[1] += tiy3 + yr*tuir;
      ufld[2] += tiz3 + zr*tuir;

      // get induced dipole field gradient used for quadrupole torques

      numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp);
      numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp);
      numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp);
      tuir = -psr7*ukr - dsr7*ukrp;

      dufld[0] += xr*tix5 + xr*xr*tuir;
      dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir;
      dufld[2] += yr*tiy5 + yr*yr*tuir;
      dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir;
      dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
      dufld[5] += zr*tiz5 + zr*zr*tuir;

      // get the dEd/dR terms used for direct polarization force

      term1 = bn[2] - dsc3*rr5;
      term2 = bn[3] - dsc5*rr7;
      term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0];
      term4 = rr3*drc3[0] - term1*xr - dsr5*xr;
      term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0];
      term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0];
      term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr;
      numtyp tixx = ci*term3 + dix*term4 + dir*term5 +
        (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
      numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 +
        (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;

      term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1];
      term4 = rr3*drc3[1] - term1*yr - dsr5*yr;
      term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1];
      term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1];
      term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr;
      numtyp tiyy = ci*term3 + diy*term4 + dir*term5 +
        (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
      numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 +
        (numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;

      term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2];
      term4 = rr3*drc3[2] - term1*zr - dsr5*zr;
      term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2];
      term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2];
      term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr;
      numtyp tizz = ci*term3 + diz*term4 + dir*term5 +
        (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
      numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 +
        (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;

      term3 = term1*xr*yr - rr3*yr*drc3[0];
      term4 = rr3*drc3[0] - term1*xr;
      term5 = term2*xr*yr - rr5*yr*drc5[0];
      term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0];
      term7 = rr5*drc5[0] - term2*xr;
      numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 +
        (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
      numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 +
        (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6;

      term3 = term1*xr*zr - rr3*zr*drc3[0];
      term5 = term2*xr*zr - rr5*zr*drc5[0];
      term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0];
      numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 +
        (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
      numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 +
        (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;

      term3 = term1*yr*zr - rr3*zr*drc3[1];
      term4 = rr3*drc3[1] - term1*yr;
      term5 = term2*yr*zr - rr5*zr*drc5[1];
      term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1];
      term7 = rr5*drc5[1] - term2*yr;
      numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 +
        (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
      numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 +
        (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;

      numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp;
      numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp;
      numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp;

      numtyp frcx = depx;
      numtyp frcy = depy;
      numtyp frcz = depz;

      // get the dEp/dR terms used for direct polarization force

      // tixx and tkxx
      term1 = bn[2] - psc3*rr5;
      term2 = bn[3] - psc5*rr7;
      term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0];
      term4 = rr3*prc3[0] - term1*xr - psr5*xr;
      term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0];
      term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0];
      term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr;
      tixx = ci*term3 + dix*term4 + dir*term5 +
        (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
      tkxx = ck*term3 - dkx*term4 - dkr*term5 +
        (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;

      // tiyy and tkyy
      term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1];
      term4 = rr3*prc3[1] - term1*yr - psr5*yr;
      term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1];
      term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1];
      term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr;
      tiyy = ci*term3 + diy*term4 + dir*term5 +
        (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
      tkyy = ck*term3 - dky*term4 - dkr*term5 +
        (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;

      // tizz and tkzz
      term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2];
      term4 = rr3*prc3[2] - term1*zr - psr5*zr;
      term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2];
      term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2];
      term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr;
      tizz = ci*term3 + diz*term4 + dir*term5 +
        (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
      tkzz = ck*term3 - dkz*term4 - dkr*term5 +
        (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;

      // tixy and tkxy
      term3 = term1*xr*yr - rr3*yr*prc3[0];
      term4 = rr3*prc3[0] - term1*xr;
      term5 = term2*xr*yr - rr5*yr*prc5[0];
      term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0];
      term7 = rr5*prc5[0] - term2*xr;
      tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 +
        (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
      tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 +
        (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6;

      // tixz and tkxz
      term3 = term1*xr*zr - rr3*zr*prc3[0];
      term5 = term2*xr*zr - rr5*zr*prc5[0];
      term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0];
      tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 +
        (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
      tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 +
        (numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;

      // tiyz and tkyz
      term3 = term1*yr*zr - rr3*zr*prc3[1];
      term4 = rr3*prc3[1] - term1*yr;
      term5 = term2*yr*zr - rr5*zr*prc5[1];
      term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1];
      term7 = rr5*prc5[1] - term2*yr;
      tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 +
        (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
      tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 +
        (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;

      depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz;
      depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz;
      depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz;

      frcx = frcx + depx;
      frcy = frcy + depy;
      frcz = frcz + depz;

      // get the dtau/dr terms used for mutual polarization force
      // poltyp == MUTUAL  && hippo

      term1 = bn[2] - usc3*rr5;
      term2 = bn[3] - usc5*rr7;
      term3 = usr5 + term1;
      term4 = rr3 * factor_uscale;
      term5 = -xr*term3 + rc3[0]*term4;
      term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0];
      tixx = uix*term5 + uir*term6;
      tkxx = ukx*term5 + ukr*term6;

      term5 = -yr*term3 + rc3[1]*term4;
      term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1];
      tiyy = uiy*term5 + uir*term6;
      tkyy = uky*term5 + ukr*term6;

      term5 = -zr*term3 + rc3[2]*term4;
      term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2];
      tizz = uiz*term5 + uir*term6;
      tkzz = ukz*term5 + ukr*term6;

      term4 = -usr5 * yr;
      term5 = -xr*term1 + rr3*urc3[0];
      term6 = xr*yr*term2 - rr5*yr*urc5[0];
      tixy = uix*term4 + uiy*term5 + uir*term6;
      tkxy = ukx*term4 + uky*term5 + ukr*term6;

      term4 = -usr5 * zr;
      term6 = xr*zr*term2 - rr5*zr*urc5[0];
      tixz = uix*term4 + uiz*term5 + uir*term6;
      tkxz = ukx*term4 + ukz*term5 + ukr*term6;

      term5 = -yr*term1 + rr3*urc3[1];
      term6 = yr*zr*term2 - rr5*zr*urc5[1];
      tiyz = uiy*term4 + uiz*term5 + uir*term6;
      tkyz = uky*term4 + ukz*term5 + ukr*term6;

      depx = tixx*ukxp + tixy*ukyp + tixz*ukzp
        + tkxx*uixp + tkxy*uiyp + tkxz*uizp;
      depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp
        + tkxy*uixp + tkyy*uiyp + tkyz*uizp;
      depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp
        + tkxz*uixp + tkyz*uiyp + tkzz*uizp;

      frcx = frcx + depx;
      frcy = frcy + depy;
      frcz = frcz + depz;

      f.x -= frcx;
      f.y -= frcy;
      f.z -= frcz;

      if (EVFLAG && vflag) {
        numtyp vxx = xr * frcx;
        numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy);
        numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz);
        numtyp vyy = yr * frcy;
        numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz);
        numtyp vzz = zr * frcz;

        virial[0] += vxx;
        virial[1] += vyy;
        virial[2] += vzz;
        virial[3] += vxy;
        virial[4] += vxz;
        virial[5] += vyz;
      }
    } // nbor

  } // ii<inum

  // accumulate ufld and dufld to compute tep
  store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep);

  // accumate force, energy and virial
  //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
  //     offset,eflag,vflag,ans,engv);
  store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
     offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
}

/* ----------------------------------------------------------------------
   scan standard neighbor list and make it compatible with 1-5 neighbors
   if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
   else scan special15 to see if a 1-5 neighbor and adjust offset to SBBITS15
   else do nothing to IJ entry
------------------------------------------------------------------------- */

__kernel void k_special15(__global int * dev_nbor,
                          const __global int * dev_packed,
                          const __global tagint *restrict tag,
                          const __global int *restrict nspecial15,
                          const __global tagint *restrict special15,
                          const int inum, const int nall, const int nbor_pitch,
                          const int t_per_atom) {
  int tid, ii, offset, n_stride, i;
  atom_info(t_per_atom,ii,tid,offset);

  if (ii<inum) {

    int numj, nbor, nbor_end;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    int n15 = nspecial15[ii];

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int sj=dev_packed[nbor];
      int which = sj >> SBBITS & 3;
      int j = sj & NEIGHMASK;
      tagint jtag = tag[j];

      if (!which) {
        int offset=ii;
        for (int k=0; k<n15; k++) {
          if (special15[offset] == jtag) {
            which = 4;
            break;
          }
          offset += nall;
        }
      }

      if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);
    } // for nbor

  } // if ii
}

__kernel void k_hippo_short_nbor(const __global numtyp4 *restrict x_,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
                                  __global int * dev_short_nbor,
                                  const numtyp off2,
                                  const int inum, const int nbor_pitch,
                                  const int t_per_atom) {
  __local int n_stride;
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  if (ii<inum) {
    int nbor, nbor_end;
    int i, numj;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];

    int ncount = 0;
    int m = nbor;
    dev_short_nbor[m] = 0;
    int nbor_short = nbor+n_stride;

    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int j=dev_packed[nbor];
      int nj = j;
      j &= NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<off2) {
        dev_short_nbor[nbor_short] = nj;
        nbor_short += n_stride;
        ncount++;
      }
    } // for nbor

    // store the number of neighbors for each thread
    dev_short_nbor[m] = ncount;

  } // if ii
}