/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef PPPM_GPU_KERNEL #define PPPM_GPU_KERNEL #define MAX_STENCIL 8 #define BLOCK_1D 64 #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double #define acctyp4 double4 #endif #ifdef _SINGLE_DOUBLE #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp double #define acctyp4 double4 #endif #ifndef numtyp #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp float #define acctyp4 float4 #endif #ifdef NV_KERNEL #include "geryon/ucl_nv_kernel.h" texture pos_tex; texture q_tex; #ifdef _DOUBLE_DOUBLE __inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; } __inline double fetch_q(const int& i, const double *q) { return q[i]; } __device__ inline void atomicFloatAdd(double* address, double val) { double old = *address, assumed; do { assumed = old; old = __longlong_as_double( atomicCAS((unsigned long long int*)address, __double_as_longlong(assumed), __double_as_longlong(val + assumed))); } while (assumed != old); } #else __inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(pos_tex, i); } __inline float fetch_q(const int& i, const float *q) { return tex1Dfetch(q_tex, i); } __device__ inline void atomicFloatAdd(float *address, float val) { int i_val = __float_as_int(val); int tmp0 = 0; int tmp1; while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) { tmp0 = tmp1; i_val = __float_as_int(val + __int_as_float(tmp1)); } } #endif #else #pragma OPENCL EXTENSION cl_khr_fp64: enable #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable #define GLOBAL_ID_X get_global_id(0) #define THREAD_ID_X get_local_id(0) #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define __inline inline #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] #endif __kernel void particle_map(__global numtyp4 *x_, const int nlocal, __global int *counts, __global int *ans, const numtyp b_lo_x, const numtyp b_lo_y, const numtyp b_lo_z, const numtyp delxinv, const numtyp delyinv, const numtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, const int max_atoms, __global int *error) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; int nx,ny,nz; numtyp tx,ty,tz; if (ii=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) *error=1; else { int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; int old=atom_add(counts+i, 1); if (old==max_atoms) { *error=2; atom_add(counts+i,-1); } else ans[atom_stride*old+i]=ii; } } } __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, __global int *counts, __global int *atoms, __global numtyp *brick, __global numtyp *_rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int nlocal_x, const int nlocal_y, const int nlocal_z, const numtyp b_lo_x, const numtyp b_lo_y, const numtyp b_lo_z, const numtyp delxinv, const numtyp delyinv, const numtyp delzinv, const int order, const numtyp delvolinv) { __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; int nx=THREAD_ID_X; int ny=THREAD_ID_Y; if (nx= 0; l--) { rho1d[0][k] = rho_coeff[l*order+k] + rho1d[0][k]*dx; rho1d[1][k] = rho_coeff[l*order+k] + rho1d[1][k]*dy; } } for (int n = 0; n < order; n++) { numtyp rho1d_2 = 0.0; for (int k = order-1; k >= 0; k--) rho1d_2 = rho_coeff[k*order+n] + rho1d_2*dz; numtyp y0 = z0*rho1d_2; int mz = (n+nz)*npts_y*npts_x + ny*npts_x +nx; for (int m = 0; m < order; m++) { numtyp x0 = y0*rho1d[1][m]; for (int l = 0; l < order; l++) { atomicFloatAdd(brick+mz+l,x0*rho1d[0][l]); } mz+=npts_x; } } } z_pos+=z_stride; } } } /* --------------------------- */ __kernel void make_rho2(__global numtyp4 *x_, __global numtyp *q_, __global int *counts, __global int *atoms, __global numtyp *brick, __global numtyp *_rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int nlocal_x, const int nlocal_y, const int nlocal_z, const numtyp b_lo_x, const numtyp b_lo_y, const numtyp b_lo_z, const numtyp delxinv, const numtyp delyinv, const numtyp delzinv, const int order, const numtyp delvolinv) { __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; int nx=BLOCK_ID_X; int ny=BLOCK_ID_Y; int tx=THREAD_ID_X; if (tx= 0; l--) { rho1d[0][k] = rho_coeff[l*order+k] + rho1d[0][k]*dx; rho1d[1][k] = rho_coeff[l*order+k] + rho1d[1][k]*dy; } } for (int n = 0; n < order; n++) { numtyp rho1d_2 = 0.0; for (int k = order-1; k >= 0; k--) rho1d_2 = rho_coeff[k*order+n] + rho1d_2*dz; numtyp y0 = z0*rho1d_2; int mz = (n+nz)*npts_y*npts_x + ny*npts_x +nx; for (int m = 0; m < order; m++) { numtyp x0 = y0*rho1d[1][m]; for (int l = 0; l < order; l++) { atomicFloatAdd(brick+mz+l,x0*rho1d[0][l]); } mz+=npts_x; } } } z_pos+=z_stride; } } } /* --------------------------- */ __kernel void make_rho3(__global numtyp4 *x_, __global numtyp *q_, const int nlocal, __global numtyp *brick, __global numtyp *_rho_coeff, const int npts_x, const int npts_y, const int nlocal_x, const int nlocal_y, const int nlocal_z, const numtyp b_lo_x, const numtyp b_lo_y, const numtyp b_lo_z, const numtyp delxinv, const numtyp delyinv, const numtyp delzinv, const numtyp shift, const int order, const numtyp delvolinv, __global int *error) { __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; int ii=THREAD_ID_X; if (ii=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) *error=1; else { numtyp z0=delvolinv*fetch_q(ii,q_); numtyp dx = nx+shift - tx; numtyp dy = ny+shift - ty; numtyp dz = nz+shift - tz; numtyp rho1d[2][MAX_STENCIL]; for (int k = 0; k < order; k++) { rho1d[0][k] = 0.0; rho1d[1][k] = 0.0; for (int l = order-1; l >= 0; l--) { rho1d[0][k] = rho_coeff[l*order+k] + rho1d[0][k]*dx; rho1d[1][k] = rho_coeff[l*order+k] + rho1d[1][k]*dy; } } for (int n = 0; n < order; n++) { numtyp rho1d_2 = 0.0; for (int k = order-1; k >= 0; k--) rho1d_2 = rho_coeff[k*order+n] + rho1d_2*dz; numtyp y0 = z0*rho1d_2; int mz = (n+nz)*npts_y*npts_x + ny*npts_x +nx; for (int m = 0; m < order; m++) { numtyp x0 = y0*rho1d[1][m]; for (int l = 0; l < order; l++) { atomicFloatAdd(brick+mz+l,x0*rho1d[0][l]); } mz+=npts_x; } } } } } #endif