lammps/lib/gpu/pppm_gpu_kernel.cu

/* ----------------------------------------------------------------------
   LAMMPS-Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef PPPM_GPU_KERNEL
#define PPPM_GPU_KERNEL

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif

#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif

#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
texture<float> q_tex;

#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
  return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
  return q[i];
}

#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
  return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
  return tex1Dfetch(q_tex, i);
}

#endif

#else

#pragma OPENCL EXTENSION cl_khr_fp64: enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define GLOBAL_SIZE_X get_global_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline

#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define MEM_THREADS 16

#endif

// Maximum order for spline
#define MAX_SPLINE 8
// Thread block size for all kernels (Must be >=MAX_SPLINE^2)
#define BLOCK_1D 64
// Number of threads per pencil for charge spread
//#define PENCIL_SIZE MEM_THREADS
#define PENCIL_SIZE 32
// Number of pencils per block for charge spread
//#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
#define BLOCK_PENCILS 2

__kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
                           const numtyp delvolinv, const int nlocal,
                           __global int *counts, __global numtyp4 *ans,
                           const numtyp b_lo_x, const numtyp b_lo_y,
                           const numtyp b_lo_z, const numtyp delxinv,
                           const numtyp delyinv, const numtyp delzinv,
                           const int nlocal_x, const int nlocal_y,
                           const int nlocal_z, const int atom_stride,
                           const int max_atoms, __global int *error) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;

  // Resequence the atom indices to avoid collisions during atomic ops
  int nthreads=GLOBAL_SIZE_X;
  ii=mul24(ii,BLOCK_1D);
  ii-=int(ii/nthreads)*(nthreads-1);

  int nx,ny,nz;

  if (ii<nlocal) {
    numtyp4 p=fetch_pos(ii,x_);
    numtyp4 delta;
    delta.w=delvolinv*fetch_q(ii,q_);

    delta.x=(p.x-b_lo_x)*delxinv;
    nx=int(delta.x);
    delta.y=(p.y-b_lo_y)*delyinv;
    ny=int(delta.y);
    delta.z=(p.z-b_lo_z)*delzinv;
    nz=int(delta.z);

    if (delta.x<0 || delta.y<0 || delta.z<0 ||
        nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
      *error=1;
    else {
      delta.x=nx+0.5-delta.x;
      delta.y=ny+0.5-delta.y;
      delta.z=nz+0.5-delta.z;

      int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx;
      int old=atom_add(counts+i, 1);
      if (old==max_atoms) {
        *error=2;
        atom_add(counts+i, -1);
      } else
        ans[atom_stride*old+i]=delta;
    }
  }
}

/* --------------------------- */

__kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
                       __global int *counts, __global numtyp4 *atoms,
                       __global numtyp *brick, __global numtyp *_rho_coeff,
                       const int atom_stride, const int npts_x,
                       const int npts_y, const int npts_z, const int nlocal_x,
                       const int nlocal_y, const int nlocal_z,
                       const int order_m_1, const int order, const int order2) {
  __local numtyp rho_coeff[MAX_SPLINE*MAX_SPLINE];
  __local numtyp front[BLOCK_PENCILS][PENCIL_SIZE+MAX_SPLINE];
  __local numtyp ans[MAX_SPLINE][BLOCK_1D];

  int tid=THREAD_ID_X;
  if (tid<order2+order)
    rho_coeff[tid]=_rho_coeff[tid];

  int pid=tid/PENCIL_SIZE;
  int fid=tid%PENCIL_SIZE;
  int fid_halo=PENCIL_SIZE+fid;
  if (fid<order)
    front[pid][fid_halo]=(numtyp)0.0;

  __syncthreads();

  int bt=BLOCK_ID_X*BLOCK_PENCILS+pid;
  int ny=bt%npts_y;
  int nz=bt/npts_y;
  int y_start=0;
  int z_start=0;
  int y_stop=order;
  int z_stop=order;
  if (ny<order_m_1)
    y_start=order_m_1-ny;
  if (nz<order_m_1)
    z_start=order_m_1-nz;
  if (ny>=nlocal_y)
    y_stop-=ny-nlocal_y+1;
  if (nz>=nlocal_z)
    z_stop-=nz-nlocal_z+1;
  int z_stride=mul24(nlocal_x,nlocal_y);

  int loop_count=npts_x/PENCIL_SIZE+1;
  int nx=fid;
  int pt=mul24(nz,mul24(npts_y,npts_x))+mul24(ny,npts_x)+nx;
  for (int i=0 ; i<loop_count; i++) {
    for (int n=0; n<order; n++)
      ans[n][tid]=(numtyp)0.0;
    if (nx<nlocal_x && nz<npts_z) {
      int z_pos=mul24(nz+z_start-order_m_1,z_stride);
      for (int m=z_start; m<z_stop; m++) {
        int y_pos=mul24(ny+y_start-order_m_1,nlocal_x);
        for (int l=y_start; l<y_stop; l++) {
          int pos=z_pos+y_pos+nx;
          int natoms=mul24(counts[pos],atom_stride);
          for (int row=pos; row<natoms; row+=atom_stride) {
            numtyp4 delta=atoms[row];

            numtyp rho1d_1=(numtyp)0.0;
            numtyp rho1d_2=(numtyp)0.0;
            for (int k=order2+order-1; k > -1; k-=order) {
              rho1d_1=rho_coeff[k-l]+rho1d_1*delta.y;
              rho1d_2=rho_coeff[k-m]+rho1d_2*delta.z;
            }
            delta.w*=rho1d_1*rho1d_2;

            for (int n=0; n<order; n++) {
              numtyp rho1d_0=(numtyp)0.0;
              for (int k=order2+n; k>=n; k-=order)
                rho1d_0=rho_coeff[k]+rho1d_0*delta.x;
              ans[n][tid]+=delta.w*rho1d_0;
            }
          }
          y_pos+=nlocal_x;
        }
        z_pos+=z_stride;
      }
    }

    __syncthreads();
    if (fid<order) {
      front[pid][fid]=front[pid][fid_halo];
      front[pid][fid_halo]=(numtyp)0.0;
    } else
      front[pid][fid]=(numtyp)0.0;

    for (int n=0; n<order; n++) {
      front[pid][fid+n]+=ans[n][tid];
      __syncthreads();
    }

    if (nx<npts_x && nz<npts_z)
      brick[pt]=front[pid][fid];
    pt+=PENCIL_SIZE;
    nx+=PENCIL_SIZE;
  }
}

__kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
                     const int nlocal, __global numtyp4 *brick,
                     __global numtyp *_rho_coeff, const int npts_x,
                     const int npts_yx, const numtyp b_lo_x,
                     const numtyp b_lo_y, const numtyp b_lo_z,
                     const numtyp delxinv,  const numtyp delyinv,
                     const numtyp delzinv, const int order,
                     const int order2, const numtyp qqrd2e_scale,
                     __global acctyp4 *ans) {
  __local numtyp rho_coeff[MAX_SPLINE*MAX_SPLINE];
  __local numtyp rho1d_0[MAX_SPLINE][BLOCK_1D];
  __local numtyp rho1d_1[MAX_SPLINE][BLOCK_1D];

  int tid=THREAD_ID_X;
  if (tid<order2+order)
    rho_coeff[tid]=_rho_coeff[tid];
  __syncthreads();

  int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;

  int nx,ny,nz;
  numtyp tx,ty,tz;

  if (ii<nlocal) {
    numtyp4 p=fetch_pos(ii,x_);
    numtyp qs=qqrd2e_scale*fetch_q(ii,q_);

    tx=(p.x-b_lo_x)*delxinv;
    nx=int(tx);
    ty=(p.y-b_lo_y)*delyinv;
    ny=int(ty);
    tz=(p.z-b_lo_z)*delzinv;
    nz=int(tz);

    numtyp dx=nx+(numtyp)0.5-tx;
    numtyp dy=ny+(numtyp)0.5-ty;
    numtyp dz=nz+(numtyp)0.5-tz;

    for (int k=0; k<order; k++) {
      rho1d_0[k][tid]=(numtyp)0.0;
      rho1d_1[k][tid]=(numtyp)0.0;
      for (int l=order2+k; l>=k; l-=order) {
        rho1d_0[k][tid]=rho_coeff[l]+rho1d_0[k][tid]*dx;
        rho1d_1[k][tid]=rho_coeff[l]+rho1d_1[k][tid]*dy;
      }
    }

    numtyp4 ek;
    ek.x=(acctyp)0.0;
    ek.y=(acctyp)0.0;
    ek.z=(acctyp)0.0;
    int mz=mul24(nz,npts_yx)+nx;
    for (int n=0; n<order; n++) {
      numtyp rho1d_2=(numtyp)0.0;
      for (int k=order2+n; k>=n; k-=order)
        rho1d_2=rho_coeff[k]+rho1d_2*dz;
      numtyp z0=qs*rho1d_2;
      int my=mz+mul24(ny,npts_x);
      for (int m=0; m<order; m++) {
        numtyp y0=z0*rho1d_1[m][tid];
	      for (int l=0; l<order; l++) {
	        numtyp x0=y0*rho1d_0[l][tid];
	        numtyp4 el=brick[my+l];
	        ek.x-=x0*el.x;
	        ek.y-=x0*el.y;
	        ek.z-=x0*el.z;
	      }
        my+=npts_x;
      }
      mz+=npts_yx;
	  }
    ans[ii]=ek;
	}
}

#endif