diff --git a/lib/gpu/cmmc_msm_gpu_kernel.cu b/lib/gpu/cmmc_msm_gpu_kernel.cu new file mode 100644 index 0000000000..09fad801eb --- /dev/null +++ b/lib/gpu/cmmc_msm_gpu_kernel.cu @@ -0,0 +1,531 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef CMMM_GPU_KERNEL +#define CMMM_GPU_KERNEL + +#ifdef NV_KERNEL + +#include "nv_kernel_def.h" +texture pos_tex; +texture q_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +__inline double fetch_q(const int& i, const double *q) +{ + return q[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +__inline float fetch_q(const int& i, const float *q) +{ + return tex1Dfetch(q_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] +#define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 + +#endif + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const int smooth, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + __local numtyp _ia; + __local numtyp _ia2; + __local numtyp _ia3; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + 
virial[i]=(acctyp)0; + + if (ii0) { + if (rsq < cut_coulsq) + if (smooth==0) + e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+ + (numtyp)1.3125*r4_ia4- + (numtyp)0.3125*r4_ia4*r2_ia2)- + factor_coul*ir); + else + e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125* + r2_ia2+(numtyp)2.953125*r4_ia4- + (numtyp)1.40625*r6_ia6+ + (numtyp)0.2734375*r4_ia4*r4_ia4)); + + if (rsq < lj1[mtype].y) { + energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- + lj3[mtype].w; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const int smooth, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) { + if (rsq < cut_coulsq) + if (smooth==0) + e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+ + (numtyp)1.3125*r4_ia4- + (numtyp)0.3125*r4_ia4*r2_ia2)- + factor_coul*ir); + else + e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125* + r2_ia2+(numtyp)2.953125*r4_ia4- + (numtyp)1.40625*r6_ia6+ + (numtyp)0.2734375*r4_ia4*r4_ia4)); + if (rsq < lj1[mtype].y) { + energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- + lj3[mtype].w; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; 
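+          // Strided (tree) reduction over the t_per_atom threads assigned to
+          // this atom: at each step the lower half of the group folds in the
+          // partial sums held by the upper half.  No barrier is issued here,
+          // presumably because the group is expected to map onto a single
+          // warp/sub-group.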
+ } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii*/ +} + +#endif + diff --git a/lib/gpu/ellipsoid_nbor.cu b/lib/gpu/ellipsoid_nbor.cu new file mode 100644 index 0000000000..e91d356b4e --- /dev/null +++ b/lib/gpu/ellipsoid_nbor.cu @@ -0,0 +1,165 @@ +// ************************************************************************** +// ellipsoid_nbor.cu +// ------------------- +// W. Michael Brown +// +// Device code for Ellipsoid neighbor routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifndef ELLIPSOID_NBOR_H +#define ELLIPSOID_NBOR_H + +#ifdef NV_KERNEL + +#include "nv_kernel_def.h" + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define MAX_SHARED_TYPES 8 + +#endif + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#else +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#endif + +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF + +// --------------------------------------------------------------------------- +// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access +// -- Only unpack neighbors matching the specified inclusive range of forms +// -- Only unpack neighbors within cutoff +// --------------------------------------------------------------------------- +__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, + const int ntypes, __global int *dev_nbor, + const int nbor_pitch, const int start, const int inum, + __global int *dev_ij, const int form_low, + const int form_high) { + + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X+start; + + if (ii=form_low && cf.y<=form_high) { + // Compute r12; + numtyp rsq=jx.x-ix.x; + rsq*=rsq; + numtyp t=jx.y-ix.y; + rsq+=t*t; + t=jx.z-ix.z; + rsq+=t*t; + + if (rsq=form_low && form[mtype]<=form_high) { + // Compute r12; + numtyp rsq=jx.x-ix.x; + rsq*=rsq; + numtyp t=jx.y-ix.y; + rsq+=t*t; + t=jx.z-ix.z; + rsq+=t*t; + + if (rsq> SBBITS & 3; } + +__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, + numtyp ans[9]) +{ + numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- + m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- + m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; + den = (numtyp)1.0/den; + + ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- + m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ + m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- + m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ + m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; + + ans[1] = 
shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ + (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- + (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- + m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ + m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; + + ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- + m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- + m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ + (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ + m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; + + ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ + m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ + m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- + m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- + m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; + + ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ + (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- + (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ + m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- + m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; + + ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- + m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ + (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ + m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- + (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; + + ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ + (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ + m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- + m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- + m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; + + ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- + (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ + (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- + m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ + m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; + + ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- + m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- + m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ + (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ + m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; +} + +__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *gum, __global numtyp2* sig_eps, + const int ntypes, __global numtyp *lshape, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, const int astride, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int inum, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-r; + r12[1]*=-r; + r12[2]*=-r; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + + // Torque on 1 + temp1 = -u_r*eta*factor_lj; + temp2 = -u_r*chi*factor_lj; + numtyp temp3 = -chi*eta*factor_lj; + tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; + tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; + tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; + + } // for nbor + } // if ii + + // Reduce answers 
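+  // When t_per_atom > 1, several threads have each accumulated a partial
+  // force, torque, energy and virial for the same atom; the block below
+  // combines those partial sums through the red_acc scratch array in local
+  // (shared) memory before the per-atom result is stored.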
+ if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; + + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + ap1+=astride; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=astride; + } + } + ans[ii]=f; + ans[ii+astride]=tor; + } // if ii +} + +#endif + diff --git a/lib/gpu/gayberne_lj.cu b/lib/gpu/gayberne_lj.cu new file mode 100644 index 0000000000..4bd3a1f82a --- /dev/null +++ b/lib/gpu/gayberne_lj.cu @@ -0,0 +1,594 @@ +// ************************************************************************** +// gayberne_lj.cu +// ------------------- +// W. Michael Brown +// +// Device code for Gay-Berne - Lennard-Jones potential acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifndef GAYBERNE_LJ_CU +#define GAYBERNE_LJ_CU + +#ifdef NV_KERNEL +#include "ellipsoid_extra.h" +#endif + +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + +__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, + __global numtyp4* shape,__global numtyp4* well, + __global numtyp *gum, __global numtyp2* sig_eps, + const int ntypes, __global numtyp *lshape, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag,const int start, const int inum, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-1; + r12[1]*=-1; + r12[2]*=-1; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + 
red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); + energy+=factor_lj*(e-lj3[ii].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1+=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=inum; + } + } + acctyp4 old=ans[ii]; + old.x+=f.x; + old.y+=f.y; + old.z+=f.z; + ans[ii]=old; + } // if ii +} + +__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if (tid<4) + sp_lj[tid]=gum[tid+3]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } 
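+  // The per-type LJ coefficients (up to MAX_SHARED_TYPES*MAX_SHARED_TYPES
+  // pairs) and the special-bond factors are staged in local (shared) memory
+  // by the first threads of the work-group; the __syncthreads() below makes
+  // them visible to every thread before the neighbor loop begins.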
+ + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1+=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=inum; + } + } + acctyp4 old=ans[ii]; + old.x+=f.x; + old.y+=f.y; + old.z+=f.z; + ans[ii]=old; + } // if ii +} + +#endif + diff --git a/lib/gpu/lj_class2_long.cu b/lib/gpu/lj_class2_long.cu new file mode 100644 index 0000000000..5fc7c408df --- /dev/null +++ b/lib/gpu/lj_class2_long.cu @@ -0,0 +1,472 @@ +// ************************************************************************** +// lj_class2_long.cu +// ------------------- +// W. 
Michael Brown +// +// Device code for COMPASS LJ long acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : Mon May 16 2011 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifndef LJCL_GPU_KERNEL +#define LJCL_GPU_KERNEL + +#ifdef NV_KERNEL + +#include "nv_kernel_def.h" +texture pos_tex; +texture q_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +__inline double fetch_q(const int& i, const double *q) +{ + return q[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +__inline float fetch_q(const int& i, const float *q) +{ + return tex1Dfetch(q_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] +#define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 + +#endif + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define EWALD_F (numtyp)1.12837917 +#define EWALD_P (numtyp)0.3275911 +#define A1 (numtyp)0.254829592 +#define A2 (numtyp)-0.284496736 +#define A3 (numtyp)1.421413741 +#define A4 (numtyp)-1.453152027 +#define A5 (numtyp)1.061405429 + +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].w) { + numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + 
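+      // The pair's force has been folded into f above and, when vflag is
+      // set, the six independent virial components have been accumulated as
+      // products of the displacement and force components.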
+ } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].w) { + numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii*/ +} + +#endif + diff --git a/lib/gpu/re_squared.cu b/lib/gpu/re_squared.cu new file mode 100644 index 0000000000..d91a04f5a2 --- /dev/null +++ 
b/lib/gpu/re_squared.cu @@ -0,0 +1,526 @@ +// ************************************************************************** +// re_squared.cu +// ------------------- +// W. Michael Brown +// +// Device code for RE-Squared potential acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : Fri May 06 2011 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifndef RE_SQUARED_CU +#define RE_SQUARED_CU + +#ifdef NV_KERNEL +#include "ellipsoid_extra.h" +#endif + +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + +__inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) +{ + numtyp ans; + ans = m2[0]*m[4]*m[8] - m2[0]*m[5]*m[7] - + m[3]*m2[1]*m[8] + m[3]*m2[2]*m[7] + + m[6]*m2[1]*m[5] - m[6]*m2[2]*m[4] + + m[0]*m2[4]*m[8] - m[0]*m2[5]*m[7] - + m2[3]*m[1]*m[8] + m2[3]*m[2]*m[7] + + m[6]*m[1]*m2[5] - m[6]*m[2]*m2[4] + + m[0]*m[4]*m2[8] - m[0]*m[5]*m2[7] - + m[3]*m[1]*m2[8] + m[3]*m[2]*m2[7] + + m2[6]*m[1]*m[5] - m2[6]*m[2]*m[4]; + return ans; +} + +__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *splj, __global numtyp2* sig_eps, + const int ntypes, __global int *dev_nbor, + const int stride, __global acctyp4 *ans, + const int astride, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int inum, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; + + __local numtyp b_alpha, cr60; + b_alpha=(numtyp)45.0/(numtyp)56.0; + cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (iibody) + numtyp aTe1[9]; // A'*E + numtyp gamma1[9]; // A'*S^2*A + numtyp sa1[9]; // S^2*A; + numtyp lA1_0[9], lA1_1[9], lA1_2[9]; // -A*rotation generator (x,y, or z) + numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA + numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa + numtyp4 ishape; + + ishape=shape[itype]; + numtyp4 ishape2; + ishape2.x=ishape.x*ishape.x; + ishape2.y=ishape.y*ishape.y; + ishape2.z=ishape.z*ishape.z; + numtyp ilshape = ishape.x*ishape.y*ishape.z; + + { + numtyp aTs[9]; // A1'*S1^2 + gpu_quat_to_mat_trans(q,i,a1); + gpu_transpose_times_diag3(a1,well[itype],aTe1); + gpu_transpose_times_diag3(a1,ishape2,aTs); + gpu_diag_times3(ishape2,a1,sa1); + gpu_times3(aTs,a1,gamma1); + gpu_rotation_generator_x(a1,lA1_0); + gpu_rotation_generator_y(a1,lA1_1); + gpu_rotation_generator_z(a1,lA1_2); + gpu_times3(aTs,lA1_0,lAtwo1_0); + gpu_transpose_times3(lA1_0,sa1,lAsa1_0); + gpu_plus3(lAsa1_0,lAtwo1_0,lAsa1_0); + gpu_times3(aTs,lA1_1,lAtwo1_1); + gpu_transpose_times3(lA1_1,sa1,lAsa1_1); + gpu_plus3(lAsa1_1,lAtwo1_1,lAsa1_1); + gpu_times3(aTs,lA1_2,lAtwo1_2); + gpu_transpose_times3(lA1_2,sa1,lAsa1_2); + gpu_plus3(lAsa1_2,lAtwo1_2,lAsa1_2); + } + ishape2.x=(numtyp)1.0/ishape2.x; + ishape2.y=(numtyp)1.0/ishape2.y; 
+ ishape2.z=(numtyp)1.0/ishape2.z; + + numtyp factor_lj; + for ( ; nborbody) + numtyp gamma2[9]; // A'*S^2*A + numtyp4 jshape; + + jshape=shape[jtype]; + numtyp4 jshape2; + jshape2.x=jshape.x*jshape.x; + jshape2.y=jshape.y*jshape.y; + jshape2.z=jshape.z*jshape.z; + { + numtyp aTs[9]; // A1'*S1^2 + gpu_quat_to_mat_trans(q,j,a2); + gpu_transpose_times_diag3(a2,jshape2,aTs); + gpu_times3(aTs,a2,gamma2); + } + + numtyp temp[9], s[3], z1[3], z2[3], v1[3], v2[3]; + numtyp sigma12, sigma1, sigma2; + gpu_plus3(gamma1,gamma2,temp); + gpu_mldivide3(temp,rhat,s,err_flag); + sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat)); + gpu_times_column3(a1,rhat,z1); + gpu_times_column3(a2,rhat,z2); + v1[0] = z1[0]*ishape2.x; + v1[1] = z1[1]*ishape2.y; + v1[2] = z1[2]*ishape2.z; + v2[0] = z2[0]/jshape2.x; + v2[1] = z2[1]/jshape2.y; + v2[2] = z2[2]/jshape2.z; + sigma1 = sqrt(gpu_dot3(z1,v1)); + sigma2 = sqrt(gpu_dot3(z2,v2)); + + numtyp H12[9]; + numtyp dH; + H12[0] = gamma1[0]*sigma1+gamma2[0]*sigma2; + H12[1] = gamma1[1]*sigma1+gamma2[1]*sigma2; + H12[2] = gamma1[2]*sigma1+gamma2[2]*sigma2; + H12[3] = gamma1[3]*sigma1+gamma2[3]*sigma2; + H12[4] = gamma1[4]*sigma1+gamma2[4]*sigma2; + H12[5] = gamma1[5]*sigma1+gamma2[5]*sigma2; + H12[6] = gamma1[6]*sigma1+gamma2[6]*sigma2; + H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2; + H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2; + dH=gpu_det3(H12); + + numtyp sigma1p2, sigma2p2, lambda, nu; + sigma1p2 = sigma1*sigma1; + sigma2p2 = sigma2*sigma2; + numtyp jlshape = jshape.x*jshape.y*jshape.z; + lambda = ilshape*sigma1p2 + jlshape*sigma2p2; + + + sigma1=(numtyp)1.0/sigma1; + sigma2=(numtyp)1.0/sigma2; + + nu = sqrt(dH/(sigma1+sigma2)); + gpu_times3(aTe1,a1,temp); + + numtyp sigma, epsilon; + int mtype=mul24(ntypes,itype)+jtype; + sigma = sig_eps[mtype].x; + epsilon = sig_eps[mtype].y*factor_lj; + + numtyp w[3], temp2[9]; + numtyp h12,eta,chi,sprod,sigh,tprod; + numtyp aTe2[9]; // A'*E + gpu_transpose_times_diag3(a2,well[jtype],aTe2); + gpu_times3(aTe2,a2,temp2); + gpu_plus3(temp,temp2,temp); + gpu_mldivide3(temp,rhat,w,err_flag); + h12 = (numtyp)1.0/rnorm-sigma12; + eta = lambda/nu; + chi = (numtyp)2.0*gpu_dot3(rhat,w); + sprod = ilshape * jlshape; + sigh = sigma/h12; + tprod = eta*chi*sigh; + + numtyp stemp, Ua; + stemp = h12*(numtyp)0.5; + Ua = (ishape.x+stemp)*(ishape.y+stemp)* + (ishape.z+stemp)*(jshape.x+stemp)* + (jshape.y+stemp)*(jshape.z+stemp); + Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*sprod/Ua; + Ua = epsilon*Ua/(numtyp)-36.0; + + numtyp Ur; + stemp = h12/cr60; + Ur = (ishape.x+stemp)*(ishape.y+stemp)* + (ishape.z+stemp)*(jshape.x+stemp)* + (jshape.y+stemp)*(jshape.z+stemp); + Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur; + numtyp sigh6=sigh*sigh*sigh; + sigh6*=sigh6; + Ur = epsilon*Ur*sigh6/(numtyp)2025.0; + + energy+=Ua+Ur; + + // force + + numtyp vsigma1[3], vsigma2[3], gsigma1[9], gsigma2[9]; + numtyp sec, sigma12p3, sigma1p3, sigma2p3; + sec = sigma*eta*chi; + sigma12p3 = sigma12*sigma12*sigma12; + sigma1p3 = sigma1/sigma1p2; + sigma2p3 = sigma2/sigma2p2; + vsigma1[0] = -sigma1p3*v1[0]; + vsigma1[1] = -sigma1p3*v1[1]; + vsigma1[2] = -sigma1p3*v1[2]; + vsigma2[0] = -sigma2p3*v2[0]; + vsigma2[1] = -sigma2p3*v2[1]; + vsigma2[2] = -sigma2p3*v2[2]; + gsigma1[0] = -gamma1[0]*sigma1p2; + gsigma1[1] = -gamma1[1]*sigma1p2; + gsigma1[2] = -gamma1[2]*sigma1p2; + gsigma1[3] = -gamma1[3]*sigma1p2; + gsigma1[4] = -gamma1[4]*sigma1p2; + gsigma1[5] = -gamma1[5]*sigma1p2; + gsigma1[6] = -gamma1[6]*sigma1p2; + gsigma1[7] = -gamma1[7]*sigma1p2; + gsigma1[8] = -gamma1[8]*sigma1p2; + gsigma2[0] 
= -gamma2[0]*sigma2p2; + gsigma2[1] = -gamma2[1]*sigma2p2; + gsigma2[2] = -gamma2[2]*sigma2p2; + gsigma2[3] = -gamma2[3]*sigma2p2; + gsigma2[4] = -gamma2[4]*sigma2p2; + gsigma2[5] = -gamma2[5]*sigma2p2; + gsigma2[6] = -gamma2[6]*sigma2p2; + gsigma2[7] = -gamma2[7]*sigma2p2; + gsigma2[8] = -gamma2[8]*sigma2p2; + + numtyp tsig1sig2, tdH, teta1, teta2; + numtyp fourw[3], spr[3]; + tsig1sig2 = eta/((numtyp)2.0*(sigma1+sigma2)); + tdH = eta/((numtyp)2.0*dH); + teta1 = (numtyp)2.0*eta/lambda; + teta2 = teta1*jlshape/sigma2p3; + teta1 = teta1*ilshape/sigma1p3; + fourw[0] = (numtyp)4.0*w[0]; + fourw[1] = (numtyp)4.0*w[1]; + fourw[2] = (numtyp)4.0*w[2]; + spr[0] = (numtyp)0.5*sigma12p3*s[0]; + spr[1] = (numtyp)0.5*sigma12p3*s[1]; + spr[2] = (numtyp)0.5*sigma12p3*s[2]; + + numtyp hsec, dspu, pbsu; + stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+ + (numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+ + (numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+ + (numtyp)1.0/(jshape.x*(numtyp)2.0+h12)+ + (numtyp)1.0/(jshape.y*(numtyp)2.0+h12)+ + (numtyp)1.0/(jshape.z*(numtyp)2.0+h12); + hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec); + dspu = (numtyp)1.0/h12-hsec+stemp; + pbsu = (numtyp)3.0*sigma*hsec; + + numtyp dspr, pbsr; + stemp = (numtyp)1.0/(ishape.x*cr60+h12)+ + (numtyp)1.0/(ishape.y*cr60+h12)+ + (numtyp)1.0/(ishape.z*cr60+h12)+ + (numtyp)1.0/(jshape.x*cr60+h12)+ + (numtyp)1.0/(jshape.y*cr60+h12)+ + (numtyp)1.0/(jshape.z*cr60+h12); + hsec = (numtyp)1.0/(h12+b_alpha*sec); + dspr = (numtyp)7.0/h12-hsec+stemp; + pbsr = b_alpha*sigma*hsec; + + numtyp dH12[9]; + numtyp dUa, dUr, deta, dchi, ddH, dh12; + numtyp dsigma1, dsigma2; + + #pragma unroll + for (int i=0; i<3; i++) { + numtyp u[3], u1[3], u2[3]; + u[0] = -rhat[i]*rhat[0]; + u[1] = -rhat[i]*rhat[1]; + u[2] = -rhat[i]*rhat[2]; + u[i] += (numtyp)1.0; + u[0] *= rnorm; + u[1] *= rnorm; + u[2] *= rnorm; + gpu_times_column3(a1,u,u1); + gpu_times_column3(a2,u,u2); + dsigma1=gpu_dot3(u1,vsigma1); + dsigma2=gpu_dot3(u2,vsigma2); + dH12[0] = dsigma1*gsigma1[0]+dsigma2*gsigma2[0]; + dH12[1] = dsigma1*gsigma1[1]+dsigma2*gsigma2[1]; + dH12[2] = dsigma1*gsigma1[2]+dsigma2*gsigma2[2]; + dH12[3] = dsigma1*gsigma1[3]+dsigma2*gsigma2[3]; + dH12[4] = dsigma1*gsigma1[4]+dsigma2*gsigma2[4]; + dH12[5] = dsigma1*gsigma1[5]+dsigma2*gsigma2[5]; + dH12[6] = dsigma1*gsigma1[6]+dsigma2*gsigma2[6]; + dH12[7] = dsigma1*gsigma1[7]+dsigma2*gsigma2[7]; + dH12[8] = dsigma1*gsigma1[8]+dsigma2*gsigma2[8]; + ddH = det_prime(H12,dH12); + deta = (dsigma1+dsigma2)*tsig1sig2; + deta -= ddH*tdH; + deta -= dsigma1*teta1+dsigma2*teta2; + dchi = gpu_dot3(u,fourw); + dh12 = rhat[i]+gpu_dot3(u,spr); + dUa = pbsu*(eta*dchi+deta*chi)-dh12*dspu; + dUr = pbsr*(eta*dchi+deta*chi)-dh12*dspr; + numtyp force=dUr*Ur+dUa*Ua; + if (i==0) { + f.x+=force; + if (vflag>0) + virial[0]+=-r[0]*force; + } else if (i==1) { + f.y+=force; + if (vflag>0) { + virial[1]+=-r[1]*force; + virial[3]+=-r[0]*force; + } + } else { + f.z+=force; + if (vflag>0) { + virial[2]+=-r[2]*force; + virial[4]+=-r[0]*force; + virial[5]+=-r[1]*force; + } + } + } + + // torque on i + sigma1=(numtyp)1.0/sigma1; + + numtyp fwae[3], p[3]; + gpu_row_times3(fourw,aTe1,fwae); + + { + gpu_times_column3(lA1_0,rhat,p); + dsigma1 = gpu_dot3(p,vsigma1); + dH12[0] = lAsa1_0[0]*sigma1+dsigma1*gsigma1[0]; + dH12[1] = lAsa1_0[1]*sigma1+dsigma1*gsigma1[1]; + dH12[2] = lAsa1_0[2]*sigma1+dsigma1*gsigma1[2]; + dH12[3] = lAsa1_0[3]*sigma1+dsigma1*gsigma1[3]; + dH12[4] = lAsa1_0[4]*sigma1+dsigma1*gsigma1[4]; + dH12[5] = lAsa1_0[5]*sigma1+dsigma1*gsigma1[5]; + dH12[6] = 
lAsa1_0[6]*sigma1+dsigma1*gsigma1[6]; + dH12[7] = lAsa1_0[7]*sigma1+dsigma1*gsigma1[7]; + dH12[8] = lAsa1_0[8]*sigma1+dsigma1*gsigma1[8]; + ddH = det_prime(H12,dH12); + deta = tsig1sig2*dsigma1-tdH*ddH; + deta -= teta1*dsigma1; + numtyp tempv[3]; + gpu_times_column3(lA1_0,w,tempv); + dchi = -gpu_dot3(fwae,tempv); + gpu_times_column3(lAtwo1_0,spr,tempv); + dh12 = -gpu_dot3(s,tempv); + + dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; + dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; + tor.x -= (dUa*Ua+dUr*Ur); + } + + { + gpu_times_column3(lA1_1,rhat,p); + dsigma1 = gpu_dot3(p,vsigma1); + dH12[0] = lAsa1_1[0]*sigma1+dsigma1*gsigma1[0]; + dH12[1] = lAsa1_1[1]*sigma1+dsigma1*gsigma1[1]; + dH12[2] = lAsa1_1[2]*sigma1+dsigma1*gsigma1[2]; + dH12[3] = lAsa1_1[3]*sigma1+dsigma1*gsigma1[3]; + dH12[4] = lAsa1_1[4]*sigma1+dsigma1*gsigma1[4]; + dH12[5] = lAsa1_1[5]*sigma1+dsigma1*gsigma1[5]; + dH12[6] = lAsa1_1[6]*sigma1+dsigma1*gsigma1[6]; + dH12[7] = lAsa1_1[7]*sigma1+dsigma1*gsigma1[7]; + dH12[8] = lAsa1_1[8]*sigma1+dsigma1*gsigma1[8]; + ddH = det_prime(H12,dH12); + deta = tsig1sig2*dsigma1-tdH*ddH; + deta -= teta1*dsigma1; + numtyp tempv[3]; + gpu_times_column3(lA1_1,w,tempv); + dchi = -gpu_dot3(fwae,tempv); + gpu_times_column3(lAtwo1_1,spr,tempv); + dh12 = -gpu_dot3(s,tempv); + + dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; + dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; + tor.y -= (dUa*Ua+dUr*Ur); + } + + { + gpu_times_column3(lA1_2,rhat,p); + dsigma1 = gpu_dot3(p,vsigma1); + dH12[0] = lAsa1_2[0]*sigma1+dsigma1*gsigma1[0]; + dH12[1] = lAsa1_2[1]*sigma1+dsigma1*gsigma1[1]; + dH12[2] = lAsa1_2[2]*sigma1+dsigma1*gsigma1[2]; + dH12[3] = lAsa1_2[3]*sigma1+dsigma1*gsigma1[3]; + dH12[4] = lAsa1_2[4]*sigma1+dsigma1*gsigma1[4]; + dH12[5] = lAsa1_2[5]*sigma1+dsigma1*gsigma1[5]; + dH12[6] = lAsa1_2[6]*sigma1+dsigma1*gsigma1[6]; + dH12[7] = lAsa1_2[7]*sigma1+dsigma1*gsigma1[7]; + dH12[8] = lAsa1_2[8]*sigma1+dsigma1*gsigma1[8]; + ddH = det_prime(H12,dH12); + deta = tsig1sig2*dsigma1-tdH*ddH; + deta -= teta1*dsigma1; + numtyp tempv[3]; + gpu_times_column3(lA1_2,w,tempv); + dchi = -gpu_dot3(fwae,tempv); + gpu_times_column3(lAtwo1_2,spr,tempv); + dh12 = -gpu_dot3(s,tempv); + + dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; + dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; + tor.z -= (dUa*Ua+dUr*Ur); + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; + + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + ap1+=astride; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=astride; + } + } + ans[ii]=f; + ans[ii+astride]=tor; + } // if ii + +} + +#endif + diff --git a/lib/gpu/re_squared_lj.cu b/lib/gpu/re_squared_lj.cu new file mode 100644 index 0000000000..784dbe63e7 --- 
/dev/null +++ b/lib/gpu/re_squared_lj.cu @@ -0,0 +1,892 @@ +// ************************************************************************** +// re_squared_lj.cu +// ------------------- +// W. Michael Brown +// +// Device code for RE-Squared - Lennard-Jones potential acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : Fri May 06 2011 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifndef RE_SQUARED_LJ_CU +#define RE_SQUARED_LJ_CU + +#ifdef NV_KERNEL +#include "ellipsoid_extra.h" +#endif + +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + + +__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *splj, __global numtyp2* sig_eps, + const int ntypes, __global int *dev_nbor, const int stride, + __global acctyp4 *ans, const int astride, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int inum, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; + + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; + b_alpha=(numtyp)45.0/(numtyp)56.0; + cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0); + solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0); + solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (iibody) + numtyp aTe[9]; // A'*E + numtyp lA_0[9], lA_1[9], lA_2[9]; // -A*rotation generator (x,y, or z) + + numtyp4 ishape; + ishape=shape[itype]; + numtyp ilshape=ishape.x*ishape.y*ishape.z; + + { + gpu_quat_to_mat_trans(q,i,a); + gpu_transpose_times_diag3(a,well[itype],aTe); + gpu_rotation_generator_x(a,lA_0); + gpu_rotation_generator_y(a,lA_1); + gpu_rotation_generator_z(a,lA_2); + } + + numtyp factor_lj; + for ( ; nbor0) + virial[0]+=-r[0]*force; + } else if (i==1) { + f.y+=force; + if (vflag>0) { + virial[1]+=-r[1]*force; + virial[3]+=-r[0]*force; + } + } else { + f.z+=force; + if (vflag>0) { + virial[2]+=-r[2]*force; + virial[4]+=-r[0]*force; + virial[5]+=-r[1]*force; + } + } + + } + + // torque on i + numtyp fwae[3]; + gpu_row_times3(fourw,aTe,fwae); + { + numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_0,rhat,p); + gpu_times_column3(lA_0,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_0,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.x -= (dUa*Ua+dUr*Ur); + } + + { + numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_1,rhat,p); + gpu_times_column3(lA_1,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_1,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.y -= (dUa*Ua+dUr*Ur); + } + + { 
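+      // Third of the three analogous torque blocks: the contribution about
+      // the remaining body axis, built from the rotation generator lA_2, is
+      // subtracted from tor.z just as the lA_0 and lA_1 blocks above handle
+      // tor.x and tor.y.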
+ numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_2,rhat,p); + gpu_times_column3(lA_2,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_2,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.z -= (dUa*Ua+dUr*Ur); + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; + + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; + } + } + + // Store answers + if (ii0) { + *ap1+=energy; + ap1+=astride; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=astride; + } + } + acctyp4 old=ans[ii]; + old.x+=f.x; + old.y+=f.y; + old.z+=f.z; + ans[ii]=old; + + old=ans[ii+astride]; + old.x+=tor.x; + old.y+=tor.y; + old.z+=tor.z; + ans[ii+astride]=old; + } // if ii + +} + +__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, + __global numtyp4* shape,__global numtyp4* well, + __global numtyp *splj, __global numtyp2* sig_eps, + const int ntypes, __global int *dev_nbor, + const int stride, __global acctyp4 *ans, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag,const int start, + const int inum, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; + + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; + b_alpha=(numtyp)45.0/(numtyp)56.0; + cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0); + solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0); + solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (iibody) + numtyp aTe[9]; // A'*E + numtyp4 ishape; + + ishape=shape[itype]; + gpu_quat_to_mat_trans(q,i,a); + gpu_transpose_times_diag3(a,well[itype],aTe); + + // Compute r12 + numtyp r[3], rhat[3]; + numtyp rnorm; + r[0] = ix.x-jx.x; + r[1] = ix.y-jx.y; + r[2] = ix.z-jx.z; + rnorm = gpu_dot3(r,r); + rnorm = rsqrt(rnorm); + rhat[0] = r[0]*rnorm; + rhat[1] = r[1]*rnorm; + rhat[2] = r[2]*rnorm; + + numtyp sigma, epsilon; + int mtype=mul24(ntypes,itype)+jtype; + sigma = sig_eps[mtype].x; + epsilon = sig_eps[mtype].y*factor_lj; + + numtyp aTs[9]; + numtyp4 scorrect; + numtyp half_sigma=sigma * (numtyp)0.5; + scorrect.x = ishape.x+half_sigma; + scorrect.y = ishape.y+half_sigma; + scorrect.z = ishape.z+half_sigma; + scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5; + scorrect.y = scorrect.y * scorrect.y * 
(numtyp)0.5; + scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5; + gpu_transpose_times_diag3(a,scorrect,aTs); + + // energy + + numtyp gamma[9], s[3]; + gpu_times3(aTs,a,gamma); + gpu_mldivide3(gamma,rhat,s,err_flag); + + numtyp sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat)); + numtyp temp[9], w[3]; + gpu_times3(aTe,a,temp); + temp[0] += (numtyp)1.0; + temp[4] += (numtyp)1.0; + temp[8] += (numtyp)1.0; + gpu_mldivide3(temp,rhat,w,err_flag); + + numtyp h12 = (numtyp)1.0/rnorm-sigma12; + numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w); + numtyp sigh = sigma/h12; + numtyp tprod = chi*sigh; + + numtyp Ua, Ur; + numtyp h12p3 = h12*h12*h12; + numtyp sigmap3 = sigma*sigma*sigma; + numtyp stemp = h12/(numtyp)2.0; + Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0; + numtyp ilshape=ishape.x*ishape.y*ishape.z; + Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; + Ua = epsilon*Ua*sigmap3*solv_f_a; + + stemp = h12/cr60; + Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ + (numtyp)60.0; + Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur; + numtyp sigh6=sigh*sigh*sigh; + sigh6*=sigh6; + Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r; + + energy+=Ua+Ur; + + // force + + numtyp fourw[3], spr[3]; + numtyp sec = sigma*chi; + numtyp sigma12p3 = sigma12*sigma12*sigma12; + fourw[0] = (numtyp)4.0*w[0]; + fourw[1] = (numtyp)4.0*w[1]; + fourw[2] = (numtyp)4.0*w[2]; + spr[0] = (numtyp)0.5*sigma12p3*s[0]; + spr[1] = (numtyp)0.5*sigma12p3*s[1]; + spr[2] = (numtyp)0.5*sigma12p3*s[2]; + + stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+ + (numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+ + (numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+ + (numtyp)3.0/h12; + numtyp hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec); + numtyp dspu = (numtyp)1.0/h12-hsec+stemp; + numtyp pbsu = (numtyp)3.0*sigma*hsec; + + stemp = (numtyp)1.0/(ishape.x*cr60+h12)+ + (numtyp)1.0/(ishape.y*cr60+h12)+ + (numtyp)1.0/(ishape.z*cr60+h12)+ + (numtyp)3.0/h12; + hsec = (numtyp)1.0/(h12+b_alpha*sec); + numtyp dspr = (numtyp)7.0/h12-hsec+stemp; + numtyp pbsr = b_alpha*sigma*hsec; + + #pragma unroll + for (int i=0; i<3; i++) { + numtyp u[3]; + u[0] = -rhat[i]*rhat[0]; + u[1] = -rhat[i]*rhat[1]; + u[2] = -rhat[i]*rhat[2]; + u[i] += (numtyp)1.0; + u[0] *= rnorm; + u[1] *= rnorm; + u[2] *= rnorm; + numtyp dchi = gpu_dot3(u,fourw); + numtyp dh12 = rhat[i]+gpu_dot3(u,spr); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + numtyp force=dUr*Ur+dUa*Ua; + if (i==0) { + f.x+=force; + if (vflag>0) + virial[0]+=-r[0]*force; + } else if (i==1) { + f.y+=force; + if (vflag>0) { + virial[1]+=-r[1]*force; + virial[3]+=-r[0]*force; + } + } else { + f.z+=force; + if (vflag>0) { + virial[2]+=-r[2]*force; + virial[4]+=-r[0]*force; + virial[5]+=-r[1]*force; + } + } + + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<3; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; + } + } + + // Store answers + if (ii0) { + *ap1=energy; + 
ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + sp_lj[0]=gum[0]; + sp_lj[1]=gum[1]; + sp_lj[2]=gum[2]; + sp_lj[3]=gum[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); + energy+=factor_lj*(e-lj3[ii].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1+=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=inum; + } + } + acctyp4 old=ans[ii]; + old.x+=f.x; + old.y+=f.y; + old.z+=f.z; + ans[ii]=old; + } // if ii +} + +__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + + __local numtyp sp_lj[4]; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if (tid<4) + sp_lj[tid]=gum[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + 
red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { + *ap1+=energy; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=inum; + } + } + acctyp4 old=ans[ii]; + old.x+=f.x; + old.y+=f.y; + old.z+=f.z; + ans[ii]=old; + } // if ii +} + +#endif + diff --git a/lib/gpu/replace_code.sh b/lib/gpu/replace_code.sh new file mode 100755 index 0000000000..504abeadd5 --- /dev/null +++ b/lib/gpu/replace_code.sh @@ -0,0 +1,13 @@ +#!/bin/tcsh + +set files = `echo *.h *.cpp *.cu` +rm -rf /tmp/cpp5678 +mkdir /tmp/cpp5678 +mkdir /tmp/cpp5678/lgpu + +foreach file ( $files ) +# /bin/cp $file /tmp/cpp5678/$file:t:t + # ------ Sed Replace + sed -i 's/atom->dev_engv/ans->dev_engv/g' $file +end +
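+
+# Usage sketch (not part of the original script; assumes GNU sed, since -i is
+# used without a backup suffix): run from lib/gpu so the *.h *.cpp *.cu globs
+# pick up the library sources, e.g.
+#   ./replace_code.sh
+# The loop above performs the single in-place rename
+# atom->dev_engv -> ans->dev_engv in every matched file.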