Fixed bugs with gauss/gpu in bonded systems, including factor_lj in forces and energies

2023-02-09 23:55:14 -06:00
parent b635465154
commit 88ccd546d8
2 changed files with 19 additions and 12 deletions
--- a/lib/gpu/lal_gauss.cpp
+++ b/lib/gpu/lal_gauss.cpp
@ -133,13 +133,13 @@ int GaussT::loop(const int eflag, const int vflag) {
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_sel->set_size(GX,BX);
-    this->k_pair_sel->run(&this->atom->x, &gauss1,
+    this->k_pair_sel->run(&this->atom->x, &gauss1, &sp_lj,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->ans->force, &this->ans->engv, &eflag, &vflag,
                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &gauss1, &_lj_types,
+    this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
                     &ainum, &nbor_pitch, &this->_threads_per_atom);
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@ -27,6 +27,7 @@ _texture_2d( pos_tex,int4);
 __kernel void k_gauss(const __global numtyp4 *restrict x_,
                      const __global numtyp4 *restrict gauss1,
                      const int lj_types,
+                      const __global numtyp *restrict sp_lj,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
                      __global acctyp4 *restrict ans,
@ -56,9 +57,11 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

+    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int j=dev_packed[nbor];
+      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -72,10 +75,9 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,

      int mtype=itype*lj_types+jtype;
      if (rsq<gauss1[mtype].z) {
-        numtyp r2inv = ucl_recip(rsq);
-        numtyp r = ucl_sqrt(rsq);
-        numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*
-        ucl_exp(-gauss1[mtype].y*rsq)*r2inv; //*factor_lj;
+        numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*
+          ucl_exp(-gauss1[mtype].y*rsq);
+        force*=factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
@ -84,7 +86,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
        if (EVFLAG && eflag) {
          numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) -
            gauss1[mtype].w);
-          energy+=e; //factor_lj*e;
+          energy+=factor_lj*e;
        }
        if (EVFLAG && vflag) {
          virial[0] += delx*delx*force;
@ -104,6 +106,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,

 __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp4 *restrict gauss1_in,
+                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
                           __global acctyp4 *restrict ans,
@ -114,6 +117,9 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
  atom_info(t_per_atom,ii,tid,offset);

  __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[4];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
  int n_stride;
  local_allocate_store_pair();

@ -141,9 +147,11 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

+    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int j=dev_packed[nbor];
+      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -156,10 +164,9 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<gauss1[mtype].z) {
-        numtyp r2inv = ucl_recip(rsq);
-        numtyp r = ucl_sqrt(rsq);
-        numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*
-        ucl_exp(-gauss1[mtype].y*rsq)*r2inv; //*factor_lj;
+        numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*
+          ucl_exp(-gauss1[mtype].y*rsq);
+        force*=factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
@ -168,7 +175,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
        if (EVFLAG && eflag) {
          numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) -
            gauss1[mtype].w);
-          energy+=e; //factor_lj*e;
+          energy+=factor_lj*e;
        }
        if (EVFLAG && vflag) {
          virial[0] += delx*delx*force;