/* ---------------------------------------------------------------------- LAMMPS-Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef PPPM_GPU_KERNEL #define PPPM_GPU_KERNEL #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double #define acctyp4 double4 #endif #ifdef _SINGLE_DOUBLE #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp double #define acctyp4 double4 #endif #ifndef numtyp #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp float #define acctyp4 float4 #endif #ifdef NV_KERNEL #include "geryon/ucl_nv_kernel.h" texture pos_tex; texture q_tex; #ifdef _DOUBLE_DOUBLE __inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; } __inline double fetch_q(const int& i, const double *q) { return q[i]; } #else __inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(pos_tex, i); } __inline float fetch_q(const int& i, const float *q) { return tex1Dfetch(q_tex, i); } #endif #else #pragma OPENCL EXTENSION cl_khr_fp64: enable #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable #define GLOBAL_ID_X get_global_id(0) #define THREAD_ID_X get_local_id(0) #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define GLOBAL_SIZE_X get_global_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define __inline inline #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] #define MEM_THREADS 16 #endif // Maximum order for spline #define MAX_SPLINE 8 // Thread block size for all kernels (Must be >=MAX_SPLINE^2) #define BLOCK_1D 64 // Number of threads per pencil for charge spread //#define PENCIL_SIZE MEM_THREADS #define PENCIL_SIZE 32 // Number of pencils per block for charge spread //#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE) #define BLOCK_PENCILS 2 __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_, const numtyp delvolinv, const int nlocal, __global int *counts, __global numtyp4 *ans, const numtyp b_lo_x, const numtyp b_lo_y, const numtyp b_lo_z, const numtyp delxinv, const numtyp delyinv, const numtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, const int max_atoms, __global int *error) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; // Resequence the atom indices to avoid collisions during atomic ops int nthreads=GLOBAL_SIZE_X; ii=mul24(ii,BLOCK_1D); ii-=int(ii/nthreads)*(nthreads-1); int nx,ny,nz; if (ii=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) *error=1; else { delta.x=nx+0.5-delta.x; delta.y=ny+0.5-delta.y; delta.z=nz+0.5-delta.z; int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; int old=atom_add(counts+i, 1); if (old==max_atoms) { *error=2; atom_add(counts+i, -1); } else ans[atom_stride*old+i]=delta; } } } /* --------------------------- */ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, __global int *counts, __global numtyp4 *atoms, __global numtyp *brick, __global numtyp *_rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int npts_z, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int order_m_1, const int order, const int order2) { __local numtyp rho_coeff[MAX_SPLINE*MAX_SPLINE]; __local numtyp front[BLOCK_PENCILS][PENCIL_SIZE+MAX_SPLINE]; __local numtyp ans[MAX_SPLINE][BLOCK_1D]; int tid=THREAD_ID_X; if (tid=nlocal_y) y_stop-=ny-nlocal_y+1; if (nz>=nlocal_z) z_stop-=nz-nlocal_z+1; int z_stride=mul24(nlocal_x,nlocal_y); int loop_count=npts_x/PENCIL_SIZE+1; int nx=fid; int pt=mul24(nz,mul24(npts_y,npts_x))+mul24(ny,npts_x)+nx; for (int i=0 ; i -1; k-=order) { rho1d_1=rho_coeff[k-l]+rho1d_1*delta.y; rho1d_2=rho_coeff[k-m]+rho1d_2*delta.z; } delta.w*=rho1d_1*rho1d_2; for (int n=0; n=n; k-=order) rho1d_0=rho_coeff[k]+rho1d_0*delta.x; ans[n][tid]+=delta.w*rho1d_0; } } y_pos+=nlocal_x; } z_pos+=z_stride; } } __syncthreads(); if (fid=k; l-=order) { rho1d_0[k][tid]=rho_coeff[l]+rho1d_0[k][tid]*dx; rho1d_1[k][tid]=rho_coeff[l]+rho1d_1[k][tid]*dy; } } numtyp4 ek; ek.x=(acctyp)0.0; ek.y=(acctyp)0.0; ek.z=(acctyp)0.0; int mz=mul24(nz,npts_yx)+nx; for (int n=0; n=n; k-=order) rho1d_2=rho_coeff[k]+rho1d_2*dz; numtyp z0=qs*rho1d_2; int my=mz+mul24(ny,npts_x); for (int m=0; m