cuda forces + init var

2024-06-05 17:49:27 +02:00
parent 9ea57acf54
commit eb7f947a0c
5 changed files with 251 additions and 130 deletions
--- a/lib/gpu/lal_dpd_charged.cpp
+++ b/lib/gpu/lal_dpd_charged.cpp
@ -46,14 +46,15 @@ template <class numtyp, class acctyp>
 int DPDChargedT::init(const int ntypes,
               double **host_cutsq, double **host_a0,
               double **host_gamma, double **host_sigma,
-               double **host_cut,
+               double **host_cut_dpd, double **host_cut_dpdsq, 
-               double **host_cut_dpd, double **host_cut_dpdsq, double **host_cut_slatersq,
+               double **host_cut_slatersq, **host_scale,
               double *host_special_lj,
               const bool tstat_only,
               const int nlocal, const int nall,
               const int max_nbors, const int maxspecial,
               const double cell_size,
-               const double gpu_split, FILE *_screen) {
+               const double gpu_split, FILE *_screen, double *host_special_coul,
               const double qqrd2e, const double g_ewald, double lamda) {
  const int max_shared_types=this->device->max_shared_types();
  int onetype=0;
@ -83,7 +84,28 @@ int DPDChargedT::init(const int ntypes,
    lj_types=max_shared_types;
    shared_types=true;
  }
  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write_coul(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_ONLY);
  for (int i=0; i<lj_types*lj_types; i++)
    host_write_coul[i]=0.0;
  scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack1(ntypes,lj_types,scale,host_write_coul,host_scale);
  sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<4; i++) {
    host_write_coul[i]=host_special_coul[i];
  }
  ucl_copy(sp_cl,host_write_coul,4,false);
  _lj_types=lj_types;
  _cut_coulsq=host_cut_coulsq;
  _qqrd2e=qqrd2e;
  _g_ewald=g_ewald;
  _lamda=lamda;
  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
@ -103,7 +125,7 @@ int DPDChargedT::init(const int ntypes,
    host_rsq[i]=0.0;
  cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,cutsq,host_rsq,host_cutsq, 
-                          host_cut_dpdsq, host_cut_dpd, host_cut_slatersq);
+                          host_cut_dpdsq, host_scale, host_cut_slatersq);
  double special_sqrt[4];
  special_sqrt[0] = sqrt(host_special_lj[0]);
@ -181,20 +203,23 @@ int DPDChargedT::loop(const int eflag, const int vflag) {
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_sel->set_size(GX,BX);
-    this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, &sp_sqrt,
+    this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, &sp_cl, &sp_sqrt,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->ans->force, &this->ans->engv, &eflag,
                          &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq,
                          &this->_dtinvsqrt, &this->_seed, &this->_timestep,
-                          &this->_tstat_only, &this->_threads_per_atom);
+                          &_qqrd2e, &_g_ewald, &_lamda,
                          &this->_tstat_only, &this->_threads_per_atom,);
  } else {
    this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &_lj_types, &sp_lj, &sp_sqrt,
+    this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &_lj_types, &sp_lj, &sp_cl, &sp_sqrt,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
                     &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt,
-                     &this->_seed, &this->_timestep, &this->_tstat_only,
+                     &_qqrd2e, &_g_ewald, &_lamda,
-                     &this->_threads_per_atom);
+                     &this->_seed, &this->_timestep,
                     &_qqrd2e, &_g_ewald, &_lamda,
                     &this->_tstat_only, &this->_threads_per_atom,);
  }
  this->time_pair.stop();
  return GX;
--- a/lib/gpu/lal_dpd_charged.cu
+++ b/lib/gpu/lal_dpd_charged.cu
@ -166,6 +166,7 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
                    const __global numtyp4 *restrict coeff,
                    const int lj_types,
                    const __global numtyp *restrict sp_lj,
                    const __global numtyp *restrict sp_cl_in,
                    const __global numtyp *restrict sp_sqrt,
                    const __global int * dev_nbor,
                    const __global int * dev_packed,
@ -176,19 +177,31 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
                    const __global numtyp4 *restrict v_,
                    const __global numtyp4 *restrict cutsq,
                    const numtyp dtinvsqrt, const int seed,
-                    const int timestep, const int tstat_only,
+                    const int timestep, const numtyp qqrd2e, 
                    const numtyp g_ewald, const numtyp lamda,
                    const int tstat_only,
                    const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  __local numtyp sp_cl[4];
  int n_stride;
  local_allocate_store_charge();
  sp_cl[0]=sp_cl_in[0];
  sp_cl[1]=sp_cl_in[1];
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];
  int n_stride;
  local_allocate_store_pair();
  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp energy, virial[6];
+  acctyp e_coul, energy, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0
    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
  }
@ -202,7 +215,8 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
    numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i];
    int itag=iv.w;
-    const numtyp qtmp = extra[i].x; // q[i]
+    numtyp qtmp = extra[i].x; // q[i]
    numtyp lamdainv = ucl_recip(lamda);
    numtyp factor_dpd, factor_sqrt;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
@ -211,6 +225,8 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
      int j=dev_packed[nbor];
      factor_dpd = sp_lj[sbmask(j)];
      factor_sqrt = sp_sqrt[sbmask(j)];
      numtyp factor_coul;
      factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -225,48 +241,88 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
      numtyp rsq = delx*delx+dely*dely+delz*delz;
      int mtype=itype*lj_types+jtype;
-      if (rsq<cutsq.x[mtype]) {
+
      // cutsq[mtype].x -> global squared cutoff
      if (rsq<cutsq[mtype].x) {
        numtyp r=ucl_sqrt(rsq);
-        if (r < EPSILON) continue;
+        numtyp force_dpd = (numtyp)0.0;
        numtyp force_coul = (numtyp)0.0;
-        numtyp rinv=ucl_recip(r);
+        // apply DPD force if distance below DPD cutoff
-        numtyp delvx = iv.x - jv.x;
+        // cutsq[mtype].y -> DPD squared cutoff
-        numtyp delvy = iv.y - jv.y;
+        if (rsq < cutsq[mtype].y && r < EPSILON) {
        numtyp delvz = iv.z - jv.z;
        numtyp dot = delx*delvx + dely*delvy + delz*delvz;
        numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
-        const numtyp qj = extra[j].x;
+          numtyp rinv=ucl_recip(r);
          numtyp delvx = iv.x - jv.x;
          numtyp delvy = iv.y - jv.y;
          numtyp delvz = iv.z - jv.z;
          numtyp dot = delx*delvx + dely*delvy + delz*delvz;
          numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
-        unsigned int tag1=itag, tag2=jtag;
+          const numtyp qj = extra[j].x;
-        if (tag1 > tag2) {
+
-          tag1 = jtag; tag2 = itag;
+          unsigned int tag1=itag, tag2=jtag;
          if (tag1 > tag2) {
            tag1 = jtag; tag2 = itag;
          }
          numtyp randnum = (numtyp)0.0;
          saru(tag1, tag2, seed, timestep, randnum);
          // conservative force = a0 * wd, or 0 if tstat only
          // drag force = -gamma * wd^2 * (delx dot delv) / r
          // random force = sigma * wd * rnd * dtinvsqrt;
          if (!tstat_only) force_dpd = coeff[mtype].x*wd;
          force_dpd -= coeff[mtype].y*wd*wd*dot*rinv;
          force_dpd *= factor_dpd;
          force_dpd += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
          force_dpd *=rinv;
        }
        // apply Slater electrostatic force if distance below Slater cutoff 
        // and the two species have a slater coeff
        // cutsq[mtype].w -> Coulombic squared cutoff
        if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
          numtyp r2inv=ucl_recip(rsq);
          numtyp _erfc;
          numtyp r = ucl_rsqrt(r2inv);
          numtyp grij = g_ewald * r;
          numtyp expm2 = ucl_exp(-grij*grij);
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          numtyp prefactor = extra[j].x;
          prefactor *= qqrd2e * cutsq[mtype].z * qtmp/r;
          numtyp rlamdainv = r * lamdainv;
          numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv);
          numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv)));
          force_coul = prefactor*(_erfc + EWALD_F*grij*expm2-slater_term);
          if (factor_coul > (numtyp)0) force_coul -= factor_coul*prefactor*((numtyp)1.0-slater_term);
          force_coul *= r2inv;
        }
-        numtyp randnum = (numtyp)0.0;
+        numtyp force = force_coul + force_dpd;
        saru(tag1, tag2, seed, timestep, randnum);
        // conservative force = a0 * wd, or 0 if tstat only
        // drag force = -gamma * wd^2 * (delx dot delv) / r
        // random force = sigma * wd * rnd * dtinvsqrt;
        numtyp force = (numtyp)0.0;
        if (!tstat_only) force = coeff[mtype].x*wd;
        force -= coeff[mtype].y*wd*wd*dot*rinv;
        force *= factor_dpd;
        force += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
        force*=rinv;
        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;
        if (EVFLAG && eflag) {
-          // unshifted eng of conservative term:
+
-          // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
+          if (rsq < cutsq[mtype].y && r < EPSILON) {
-          // eng shifted to 0.0 at cutoff
+            // unshifted eng of conservative term:
-          numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd;
+            // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
-          energy+=factor_dpd*e;
+            // eng shifted to 0.0 at cutoff
            numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd;
            energy+=factor_dpd*e;
          }
          if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
            numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv;
            numtyp e = prefactor*(_erfc-e_slater);
            if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater);
            e_coul += e;
          }
        }
        if (EVFLAG && vflag) {
          virial[0] += delx*delx*force;
@ -276,18 +332,22 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }
    } // for nbor
  } // if ii
-  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+  store_answers_q(f,energy, e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                ans,engv);
 }
 __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp4 *restrict extra,
                         const __global numtyp4 *restrict coeff_in,
                         const __global numtyp *restrict sp_lj_in,
                         const __global numtyp *restrict sp_cl_in,
                         const __global numtyp *restrict sp_sqrt_in,
                         const __global int * dev_nbor,
                         const __global int * dev_packed,
@ -298,39 +358,41 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp4 *restrict v_,
                         const __global numtyp4 *restrict cutsq,
                         const numtyp dtinvsqrt, const int seed,
-                         const int timestep, const int tstat_only,
+                         const int timestep, const numtyp qqrd2e, 
                         const numtyp g_ewald, const numtyp lamda,
                         const int tstat_only,
                         const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  #ifndef ONETYPE
  __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  __local numtyp sp_sqrt[4];
  /// COUL Init
  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_cl[4];
  if (tid<4) {
    sp_lj[tid]=sp_lj_in[tid];
    sp_sqrt[tid]=sp_sqrt_in[tid];
    sp_cl[tid]=sp_cl_in[tid];
  }
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    coeff[tid]=coeff_in[tid];
    scale[tid]=scale_in[tid];
  }
  __syncthreads();
-  #else
+  
  const numtyp coeffx=coeff_in[ONETYPE].x;
  const numtyp coeffy=coeff_in[ONETYPE].y;
  const numtyp coeffz=coeff_in[ONETYPE].z;
  const numtyp coeffw=coeff_in[ONETYPE].w;
  const numtyp cutsq_p=cutsq[ONETYPE];
  #endif
  int n_stride;
  local_allocate_store_pair();
  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp energy, virial[6];
+  acctyp e_coul, energy, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0;
    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
  }
@ -340,33 +402,26 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
              n_stride,nbor_end,nbor);
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    #ifndef ONETYPE
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
    #endif
    numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i];
    int itag=iv.w;
-    const numtyp qi = extra[i].x;
+    numtyp qtmp = extra[i].x; // q[i]
    numtyp lamdainv = ucl_recip(lamda);
    #ifndef ONETYPE
    numtyp factor_dpd, factor_sqrt;
    #endif
    for ( ; nbor<nbor_end; nbor+=n_stride) {
      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      #ifndef ONETYPE
      factor_dpd = sp_lj[sbmask(j)];
      factor_sqrt = sp_sqrt[sbmask(j)];
      numtyp factor_coul;
      factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
      j &= NEIGHMASK;
      #endif
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      #ifndef ONETYPE
      int mtype=itype+jx.w;
      const numtyp cutsq_p=cutsq[mtype];
      #endif
      numtyp4 jv; fetch4(jv,j,vel_tex); //v_[j];
      int jtag=jv.w;
@ -376,64 +431,89 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;
-      if (rsq<cutsq_p) {
+      int mtype=itype+jx.w;
      // cutsq[mtype].x -> global squared cutoff
      if (rsq<cutsq[mtype].x) {
        numtyp r=ucl_sqrt(rsq);
-        if (r < EPSILON) continue;
+        numtyp force_dpd = (numtyp)0.0;
        numtyp force_coul = (numtyp)0.0;
-        numtyp rinv=ucl_recip(r);
+        // apply DPD force if distance below DPD cutoff
-        numtyp delvx = iv.x - jv.x;
+        // cutsq[mtype].y -> DPD squared cutoff
-        numtyp delvy = iv.y - jv.y;
+        if (rsq < cutsq[mtype].y && r < EPSILON) {
        numtyp delvz = iv.z - jv.z;
        numtyp dot = delx*delvx + dely*delvy + delz*delvz;
        #ifndef ONETYPE
        const numtyp coeffw=coeff[mtype].w;
        #endif
        numtyp wd = (numtyp)1.0 - r/coeffw;
-        const numtyp qj = extra[j].x;
+          numtyp rinv=ucl_recip(r);
-        
+          numtyp delvx = iv.x - jv.x;
-        unsigned int tag1=itag, tag2=jtag;
+          numtyp delvy = iv.y - jv.y;
-        if (tag1 > tag2) {
+          numtyp delvz = iv.z - jv.z;
-          tag1 = jtag; tag2 = itag;
+          numtyp dot = delx*delvx + dely*delvy + delz*delvz;
          numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
          const numtyp qj = extra[j].x;
          unsigned int tag1=itag, tag2=jtag;
          if (tag1 > tag2) {
            tag1 = jtag; tag2 = itag;
          }
          numtyp randnum = (numtyp)0.0;
          saru(tag1, tag2, seed, timestep, randnum);
          // conservative force = a0 * wd, or 0 if tstat only
          // drag force = -gamma * wd^2 * (delx dot delv) / r
          // random force = sigma * wd * rnd * dtinvsqrt;
          if (!tstat_only) force_dpd = coeff[mtype].x*wd;
          force_dpd -= coeff[mtype].y*wd*wd*dot*rinv;
          force_dpd *= factor_dpd;
          force_dpd += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
          force_dpd *=rinv;
        }
        // apply Slater electrostatic force if distance below Slater cutoff 
        // and the two species have a slater coeff
        // cutsq[mtype].w -> Coulombic squared cutoff
        if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
          numtyp r2inv=ucl_recip(rsq);
          numtyp _erfc;
          numtyp r = ucl_rsqrt(r2inv);
          numtyp grij = g_ewald * r;
          numtyp expm2 = ucl_exp(-grij*grij);
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          numtyp prefactor = extra[j].x;
          prefactor *= qqrd2e * cutsq[mtype].z * qtmp/r;
          numtyp rlamdainv = r * lamdainv;
          numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv);
          numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv)));
          force_coul = prefactor*(_erfc + EWALD_F*grij*expm2-slater_term);
          if (factor_coul > (numtyp)0) force_coul -= factor_coul*prefactor*((numtyp)1.0-slater_term);
          force_coul *= r2inv;
        }
-        numtyp randnum = (numtyp)0.0;
+        numtyp force = force_coul + force_dpd;
        saru(tag1, tag2, seed, timestep, randnum);
        // conservative force = a0 * wd, or 0 if tstat only
        // drag force = -gamma * wd^2 * (delx dot delv) / r
        // random force = sigma * wd * rnd * dtinvsqrt;
        #ifndef ONETYPE
        const numtyp coeffx=coeff[mtype].x;
        const numtyp coeffy=coeff[mtype].y;
        const numtyp coeffz=coeff[mtype].z;
        #endif
        numtyp force = (numtyp)0.0;
        if (!tstat_only) force = coeffx*wd;
        force -= coeffy*wd*wd*dot*rinv;
        #ifndef ONETYPE
        force *= factor_dpd;
        force += factor_sqrt*coeffz*wd*randnum*dtinvsqrt;
        #else
        force += coeffz*wd*randnum*dtinvsqrt;
        #endif
        force*=rinv;
        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;
        if (EVFLAG && eflag) {
-          // unshifted eng of conservative term:
+
-          // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
+          if (rsq < cutsq[mtype].y && r < EPSILON) {
-          // eng shifted to 0.0 at cutoff
+            // unshifted eng of conservative term:
-          numtyp e = (numtyp)0.5*coeffx*coeffw * wd*wd;
+            // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
-          #ifndef ONETYPE
+            // eng shifted to 0.0 at cutoff
-          energy+=factor_dpd*e;
+            numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd;
-          #else
+            energy+=factor_dpd*e;
-          energy+=e;
+          }
-          #endif
+          if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
            numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv;
            numtyp e = prefactor*(_erfc-e_slater);
            if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater);
            e_coul += e;
          }
        }
        if (EVFLAG && vflag) {
          virial[0] += delx*delx*force;
@ -443,6 +523,8 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }
    } // for nbor
--- a/lib/gpu/lal_dpd_charged.h
+++ b/lib/gpu/lal_dpd_charged.h
@ -38,11 +38,12 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, double **host_cutsq, double **host_a0,
-           double **host_gamma, double **host_sigma, double **host_cut,
+           double **host_gamma, double **host_sigma,
           double **host_cut_dpd, double **host_cut_dpdsq, double **host_cut_slatersq,
-           double *host_special_lj, bool tstat_only, const int nlocal,
+           double **host_scale, double *host_special_lj, bool tstat_only, const int nlocal,
           const int nall, const int max_nbors, const int maxspecial,
-           const double cell_size, const double gpu_split, FILE *screen);
+           const double cell_size, const double gpu_split, FILE *screen,
           const double qqrd2e, const double g_ewald, const double lamda);
  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
@ -63,12 +64,15 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
  /// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut_dpd
  UCL_D_Vec<numtyp4> coeff;
-  /// cutsq.x = cutsq, cutsq.y = cut_dpd_sq, cutsq.z = cut_dpd, cutsq.w = cut_slatersq
+  /// cutsq.x = cutsq, cutsq.y = cut_dpd_sq, cutsq.z = scale, cutsq.w = cut_slatersq
  UCL_D_Vec<numtyp4> cutsq;
  /// Special LJ values
  UCL_D_Vec<numtyp> sp_lj, sp_sqrt;
  /// Special Coul values [0-3]
  UCL_D_Vec<numtyp> sp_cl;
  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;
@ -78,6 +82,9 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
  /// Only used for thermostat
  int _tstat_only;
  /// Coulombic terms
  numtyp _cut_coulsq, _qqrd2e, _g_ewald, _lamda;
  /// pointer to host data for atom charge
  double *q;
--- a/lib/gpu/lal_dpd_charged_ext.cpp
+++ b/lib/gpu/lal_dpd_charged_ext.cpp
@ -28,11 +28,12 @@ static DPDCharged<PRECISION,ACC_PRECISION> DPDCMF;
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
-                 double **host_gamma, double **host_sigma, double **host_cut,
+                 double **host_gamma, double **host_sigma,
-                 double **host_cut_dpd, double **host_cut_slater,
+                 double **host_cut_dpd, double **host_cut_dpd_sq, double **host_cut_slatersq,
-                 double *special_lj, const int inum,
+                 double **host_scale, double *special_lj, const int inum,
                 const int nall, const int max_nbors,  const int maxspecial,
-                 const double cell_size, int &gpu_mode, FILE *screen) {
+                 const double cell_size, int &gpu_mode, FILE *screen, double *host_special_coul,
                 const double qqrd2e, const double g_ewald, const double lamda) {
  DPDCMF.clear();
  gpu_mode=DPDCMF.device->gpu_mode();
  double gpu_split=DPDCMF.device->particle_split();
@ -56,8 +57,10 @@ int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
  int init_ok=0;
  if (world_me==0)
    init_ok=DPDCMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma,
-                       host_cut, host_cut_dpd, host_cut_dpdsq, host_cut_slatersq, special_lj, false, inum, nall, max_nbors,
+                       host_cut_dpd, host_cut_dpdsq, host_cut_slatersq,
-                       maxspecial, cell_size, gpu_split, screen);
+                       host_scale, special_lj, false, inum, nall, max_nbors,
                       maxspecial, cell_size, gpu_split, screen,
                       force->special_coul,->qqrd2e, g_ewald, lamda);
  DPDCMF.device->world_barrier();
  if (message)
@ -74,8 +77,10 @@ int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=DPDCMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma,
-                         host_cut, host_cut_dpd, host_cut_slater, special_lj, false, inum, nall, max_nbors,
+                         host_cut_dpd, host_cut_dpdsq, host_cut_slatersq,
-                         maxspecial, cell_size, gpu_split, screen);
+                         host_scale, special_lj, false, inum, nall, max_nbors,
                         maxspecial, cell_size, gpu_split, screen, 
                         force->special_coul,force->qqrd2e, g_ewald, lamda);
    DPDCMF.device->serialize_init();
    if (message)
--- a/src/GPU/pair_dpd_charged_gpu.cpp
+++ b/src/GPU/pair_dpd_charged_gpu.cpp
@ -39,10 +39,11 @@ using namespace EwaldConst;
 // External functions from cuda library for atom decomposition
 int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0, double **host_gamma,
-                 double **host_sigma, double **host_cut, double **host_cut_dpd, double **host_cut_slater, double *special_lj, const int inum,
+                 double **host_sigma, double **host_cut_dpd, double **host_cut_dpd_sq, double **host_cut_slatersq,
                 double **host_scale, double *special_lj, const int inum,
                 const int nall, const int max_nbors, const int maxspecial, const double cell_size,
-                 int &gpu_mode, FILE *screen, 
+                 int &gpu_mode, FILE *screen, double *host_special_coul,
-                 double *host_special_coul, const double qqrd2e, const double g_ewald, const double lamda);
+                 const double qqrd2e, const double g_ewald, const double lamda);
 void dpd_charged_gpu_clear();
 int **dpd_charged_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x,
                        int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial,
@ -313,8 +314,9 @@ void PairDPDChargedGPU::init_style()
  if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
  int mnf = 5e-2 * neighbor->oneatom;
  int success =
-      dpd_charged_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, sigma, cut, 
+      dpd_charged_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, sigma,
-                   cut_dpd, cut_dpdsq, cut_slatersq, force->special_lj, atom->nlocal,
+                   cut_dpd, cut_dpdsq, cut_slatersq, scale,
                   force->special_lj, atom->nlocal,
                   atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen,
                   force->special_coul, force->qqrd2e, g_ewald, lamda);
  GPU_EXTRA::check_flag(success, error, world);