git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14867 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2016-04-22 17:51:49 +00:00
parent 129796adc2
commit 184d5dc0f0
12 changed files with 168 additions and 171 deletions
--- a/lib/gpu/lal_coul_long.cpp
+++ b/lib/gpu/lal_coul_long.cpp
@@ -71,9 +71,6 @@ int CoulLongT::init(const int ntypes, double **host_scale,
  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

-  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
-  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
-  
  scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale);
  
@@ -88,8 +85,7 @@ int CoulLongT::init(const int ntypes, double **host_scale,
  _g_ewald=g_ewald;

  _allocated=true;
-  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+scale.row_bytes()+
-    sp_cl.row_bytes();
+  this->_max_bytes=scale.row_bytes()+sp_cl.row_bytes();
  return 0;
 }

@@ -106,8 +102,6 @@ void CoulLongT::clear() {
    return;
  _allocated=false;

-  lj1.clear();
-  lj3.clear();
  scale.clear();
  sp_cl.clear();
  this->clear_atomic();
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@@ -124,8 +124,7 @@ texture<int2> q_tex;
 #endif

 __kernel void k_coul_long(const __global numtyp4 *restrict x_, 
-                          const __global numtyp4 *restrict lj1,
-                          const __global numtyp4 *restrict lj3, 
+                          const __global numtyp *restrict scale,
                          const int lj_types,
                          const __global numtyp *restrict sp_cl_in, 
                          const __global int *dev_nbor,
@@ -161,6 +160,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
    numtyp qtmp; fetch(qtmp,i,q_tex);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
@@ -171,6 +171,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
@@ -178,6 +179,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

+      int mtype=itype*lj_types+jtype;
      if (rsq < cut_coulsq) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp force, prefactor, _erfc;
@@ -188,7 +190,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
        numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
        _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
        fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * qtmp/r;
+        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
        force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;

        f.x+=delx*force;
@@ -215,8 +217,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
 }

 __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, 
-                               const __global numtyp4 *restrict lj1_in,
-                               const __global numtyp4 *restrict lj3_in,
+                               const __global numtyp *restrict scale_in,
                               const __global numtyp *restrict sp_cl_in,
                               const __global int *dev_nbor, 
                               const __global int *dev_packed,
@@ -230,9 +231,12 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

+  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_cl[4];
  if (tid<4)
    sp_cl[tid]=sp_cl_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
+    scale[tid]=scale_in[tid];

  acctyp e_coul=(acctyp)0;
  acctyp4 f;
@@ -252,6 +256,8 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
+    int iw=ix.w;
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
      int j=dev_packed[nbor];
@@ -261,6 +267,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
@@ -278,7 +285,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
        numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
        _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
        fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * qtmp/r;
+        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
        force = prefactor*(_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;

        f.x+=delx*force;
--- a/lib/gpu/lal_coul_long.h
+++ b/lib/gpu/lal_coul_long.h
@@ -59,10 +59,6 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {

  // --------------------------- TYPE DATA --------------------------

-  /// lj1 dummy
-  UCL_D_Vec<numtyp4> lj1;
-  /// lj3 dummy
-  UCL_D_Vec<numtyp4> lj3;
  /// scale
  UCL_D_Vec<numtyp> scale;
  /// Special Coul values [0-3]
--- a/lib/meam/Makefile.mingw32-cross
+++ b/lib/meam/Makefile.mingw32-cross
@@ -23,7 +23,7 @@ FILES = $(SRC) Makefile

 DIR = Obj_mingw32/
 LIB = $(DIR)libmeam.a
-OBJ = $(SRC:%.F=$(DIR)%.o)
+OBJ = $(SRC:%.F=$(DIR)%.o) $(DIR)fm_exp.o

 # ------ SETTINGS ------

--- a/lib/meam/Makefile.mingw64-cross
+++ b/lib/meam/Makefile.mingw64-cross
@@ -23,7 +23,7 @@ FILES = $(SRC) Makefile

 DIR = Obj_mingw64/
 LIB = $(DIR)libmeam.a
-OBJ = $(SRC:%.F=$(DIR)%.o)
+OBJ = $(SRC:%.F=$(DIR)%.o) $(DIR)fm_exp.o

 # ------ SETTINGS ------