diff --git a/lib/gpu/lal_dpd_charged.cpp b/lib/gpu/lal_dpd_charged.cpp
index f07d870884..f7956ee69b 100644
--- a/lib/gpu/lal_dpd_charged.cpp
+++ b/lib/gpu/lal_dpd_charged.cpp
@@ -46,14 +46,15 @@ template <class numtyp, class acctyp>
 int DPDChargedT::init(const int ntypes,
                double **host_cutsq, double **host_a0,
                double **host_gamma, double **host_sigma,
-               double **host_cut,
-               double **host_cut_dpd, double **host_cut_dpdsq, double **host_cut_slatersq,
+               double **host_cut_dpd, double **host_cut_dpdsq, 
+               double **host_cut_slatersq, **host_scale,
                double *host_special_lj,
                const bool tstat_only,
                const int nlocal, const int nall,
                const int max_nbors, const int maxspecial,
                const double cell_size,
-               const double gpu_split, FILE *_screen) {
+               const double gpu_split, FILE *_screen, double *host_special_coul,
+               const double qqrd2e, const double g_ewald, double lamda) {
   const int max_shared_types=this->device->max_shared_types();
 
   int onetype=0;
@@ -83,7 +84,28 @@ int DPDChargedT::init(const int ntypes,
     lj_types=max_shared_types;
     shared_types=true;
   }
+  
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write_coul(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write_coul[i]=0.0;
+
+  scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack1(ntypes,lj_types,scale,host_write_coul,host_scale);
+
+  sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write_coul[i]=host_special_coul[i];
+  }
+  ucl_copy(sp_cl,host_write_coul,4,false);
+
   _lj_types=lj_types;
+  _cut_coulsq=host_cut_coulsq;
+  _qqrd2e=qqrd2e;
+  _g_ewald=g_ewald;
+  _lamda=lamda;
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
@@ -103,7 +125,7 @@ int DPDChargedT::init(const int ntypes,
     host_rsq[i]=0.0;
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,cutsq,host_rsq,host_cutsq, 
-                          host_cut_dpdsq, host_cut_dpd, host_cut_slatersq);
+                          host_cut_dpdsq, host_scale, host_cut_slatersq);
 
   double special_sqrt[4];
   special_sqrt[0] = sqrt(host_special_lj[0]);
@@ -181,20 +203,23 @@ int DPDChargedT::loop(const int eflag, const int vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_sel->set_size(GX,BX);
-    this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, &sp_sqrt,
+    this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, &sp_cl, &sp_sqrt,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq,
                           &this->_dtinvsqrt, &this->_seed, &this->_timestep,
-                          &this->_tstat_only, &this->_threads_per_atom);
+                          &_qqrd2e, &_g_ewald, &_lamda,
+                          &this->_tstat_only, &this->_threads_per_atom,);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &_lj_types, &sp_lj, &sp_sqrt,
+    this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &_lj_types, &sp_lj, &sp_cl, &sp_sqrt,
                      &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
                      &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt,
-                     &this->_seed, &this->_timestep, &this->_tstat_only,
-                     &this->_threads_per_atom);
+                     &_qqrd2e, &_g_ewald, &_lamda,
+                     &this->_seed, &this->_timestep,
+                     &_qqrd2e, &_g_ewald, &_lamda,
+                     &this->_tstat_only, &this->_threads_per_atom,);
   }
   this->time_pair.stop();
   return GX;
diff --git a/lib/gpu/lal_dpd_charged.cu b/lib/gpu/lal_dpd_charged.cu
index 1fe2bfcc17..2ffd2b9990 100644
--- a/lib/gpu/lal_dpd_charged.cu
+++ b/lib/gpu/lal_dpd_charged.cu
@@ -166,6 +166,7 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
                     const __global numtyp4 *restrict coeff,
                     const int lj_types,
                     const __global numtyp *restrict sp_lj,
+                    const __global numtyp *restrict sp_cl_in,
                     const __global numtyp *restrict sp_sqrt,
                     const __global int * dev_nbor,
                     const __global int * dev_packed,
@@ -176,19 +177,31 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
                     const __global numtyp4 *restrict v_,
                     const __global numtyp4 *restrict cutsq,
                     const numtyp dtinvsqrt, const int seed,
-                    const int timestep, const int tstat_only,
+                    const int timestep, const numtyp qqrd2e, 
+                    const numtyp g_ewald, const numtyp lamda,
+                    const int tstat_only,
                     const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
+  __local numtyp sp_cl[4];
+  int n_stride;
+  local_allocate_store_charge();
+
+  sp_cl[0]=sp_cl_in[0];
+  sp_cl[1]=sp_cl_in[1];
+  sp_cl[2]=sp_cl_in[2];
+  sp_cl[3]=sp_cl_in[3];
+
   int n_stride;
   local_allocate_store_pair();
 
   acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp energy, virial[6];
+  acctyp e_coul, energy, virial[6];
   if (EVFLAG) {
     energy=(acctyp)0;
+    e_coul=(acctyp)0
     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
   }
 
@@ -202,7 +215,8 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
     numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i];
     int itag=iv.w;
 
-    const numtyp qtmp = extra[i].x; // q[i]
+    numtyp qtmp = extra[i].x; // q[i]
+    numtyp lamdainv = ucl_recip(lamda);
 
     numtyp factor_dpd, factor_sqrt;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
@@ -211,6 +225,8 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
       int j=dev_packed[nbor];
       factor_dpd = sp_lj[sbmask(j)];
       factor_sqrt = sp_sqrt[sbmask(j)];
+      numtyp factor_coul;
+      factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -225,48 +241,88 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       int mtype=itype*lj_types+jtype;
-      if (rsq<cutsq.x[mtype]) {
+
+      // cutsq[mtype].x -> global squared cutoff
+      if (rsq<cutsq[mtype].x) {
         numtyp r=ucl_sqrt(rsq);
-        if (r < EPSILON) continue;
+        numtyp force_dpd = (numtyp)0.0;
+        numtyp force_coul = (numtyp)0.0;
 
-        numtyp rinv=ucl_recip(r);
-        numtyp delvx = iv.x - jv.x;
-        numtyp delvy = iv.y - jv.y;
-        numtyp delvz = iv.z - jv.z;
-        numtyp dot = delx*delvx + dely*delvy + delz*delvz;
-        numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
+        // apply DPD force if distance below DPD cutoff
+        // cutsq[mtype].y -> DPD squared cutoff
+        if (rsq < cutsq[mtype].y && r < EPSILON) {
 
-        const numtyp qj = extra[j].x;
+          numtyp rinv=ucl_recip(r);
+          numtyp delvx = iv.x - jv.x;
+          numtyp delvy = iv.y - jv.y;
+          numtyp delvz = iv.z - jv.z;
+          numtyp dot = delx*delvx + dely*delvy + delz*delvz;
+          numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
 
-        unsigned int tag1=itag, tag2=jtag;
-        if (tag1 > tag2) {
-          tag1 = jtag; tag2 = itag;
+          const numtyp qj = extra[j].x;
+
+          unsigned int tag1=itag, tag2=jtag;
+          if (tag1 > tag2) {
+            tag1 = jtag; tag2 = itag;
+          }
+
+          numtyp randnum = (numtyp)0.0;
+          saru(tag1, tag2, seed, timestep, randnum);
+
+          // conservative force = a0 * wd, or 0 if tstat only
+          // drag force = -gamma * wd^2 * (delx dot delv) / r
+          // random force = sigma * wd * rnd * dtinvsqrt;
+
+          
+          if (!tstat_only) force_dpd = coeff[mtype].x*wd;
+          force_dpd -= coeff[mtype].y*wd*wd*dot*rinv;
+          force_dpd *= factor_dpd;
+          force_dpd += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
+          force_dpd *=rinv;
+        }
+      
+        // apply Slater electrostatic force if distance below Slater cutoff 
+        // and the two species have a slater coeff
+        // cutsq[mtype].w -> Coulombic squared cutoff
+        if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
+          numtyp r2inv=ucl_recip(rsq);
+          numtyp _erfc;
+
+          numtyp r = ucl_rsqrt(r2inv);
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = ucl_exp(-grij*grij);
+          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          numtyp prefactor = extra[j].x;
+          prefactor *= qqrd2e * cutsq[mtype].z * qtmp/r;
+          numtyp rlamdainv = r * lamdainv;
+          numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv);
+          numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv)));
+          force_coul = prefactor*(_erfc + EWALD_F*grij*expm2-slater_term);
+          if (factor_coul > (numtyp)0) force_coul -= factor_coul*prefactor*((numtyp)1.0-slater_term);
+          force_coul *= r2inv;
         }
 
-        numtyp randnum = (numtyp)0.0;
-        saru(tag1, tag2, seed, timestep, randnum);
-
-        // conservative force = a0 * wd, or 0 if tstat only
-        // drag force = -gamma * wd^2 * (delx dot delv) / r
-        // random force = sigma * wd * rnd * dtinvsqrt;
-
-        numtyp force = (numtyp)0.0;
-        if (!tstat_only) force = coeff[mtype].x*wd;
-        force -= coeff[mtype].y*wd*wd*dot*rinv;
-        force *= factor_dpd;
-        force += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
-        force*=rinv;
-
+        numtyp force = force_coul + force_dpd;
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (EVFLAG && eflag) {
-          // unshifted eng of conservative term:
-          // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
-          // eng shifted to 0.0 at cutoff
-          numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd;
-          energy+=factor_dpd*e;
+
+          if (rsq < cutsq[mtype].y && r < EPSILON) {
+            // unshifted eng of conservative term:
+            // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
+            // eng shifted to 0.0 at cutoff
+            numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd;
+            energy+=factor_dpd*e;
+          }
+          if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
+            numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv;
+            numtyp e = prefactor*(_erfc-e_slater);
+            if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater);
+            e_coul += e;
+          }
         }
         if (EVFLAG && vflag) {
           virial[0] += delx*delx*force;
@@ -276,18 +332,22 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
+
+
       }
 
     } // for nbor
   } // if ii
-  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+  store_answers_q(f,energy, e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                 ans,engv);
+
 }
 
 __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict extra,
                          const __global numtyp4 *restrict coeff_in,
                          const __global numtyp *restrict sp_lj_in,
+                         const __global numtyp *restrict sp_cl_in,
                          const __global numtyp *restrict sp_sqrt_in,
                          const __global int * dev_nbor,
                          const __global int * dev_packed,
@@ -298,39 +358,41 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict v_,
                          const __global numtyp4 *restrict cutsq,
                          const numtyp dtinvsqrt, const int seed,
-                         const int timestep, const int tstat_only,
+                         const int timestep, const numtyp qqrd2e, 
+                         const numtyp g_ewald, const numtyp lamda,
+                         const int tstat_only,
                          const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
-  #ifndef ONETYPE
   __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
   __local numtyp sp_sqrt[4];
+  /// COUL Init
+  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_cl[4];
   if (tid<4) {
     sp_lj[tid]=sp_lj_in[tid];
     sp_sqrt[tid]=sp_sqrt_in[tid];
+    sp_cl[tid]=sp_cl_in[tid];
   }
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     coeff[tid]=coeff_in[tid];
+    scale[tid]=scale_in[tid];
   }
+
   __syncthreads();
-  #else
-  const numtyp coeffx=coeff_in[ONETYPE].x;
-  const numtyp coeffy=coeff_in[ONETYPE].y;
-  const numtyp coeffz=coeff_in[ONETYPE].z;
-  const numtyp coeffw=coeff_in[ONETYPE].w;
-  const numtyp cutsq_p=cutsq[ONETYPE];
-  #endif
+  
 
   int n_stride;
   local_allocate_store_pair();
 
   acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp energy, virial[6];
+  acctyp e_coul, energy, virial[6];
   if (EVFLAG) {
     energy=(acctyp)0;
+    e_coul=(acctyp)0;
     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
   }
 
@@ -340,33 +402,26 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    #ifndef ONETYPE
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
-    #endif
     numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i];
     int itag=iv.w;
 
-    const numtyp qi = extra[i].x;
+    numtyp qtmp = extra[i].x; // q[i]
+    numtyp lamdainv = ucl_recip(lamda);
 
-    #ifndef ONETYPE
     numtyp factor_dpd, factor_sqrt;
-    #endif
     for ( ; nbor<nbor_end; nbor+=n_stride) {
       ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
-      #ifndef ONETYPE
       factor_dpd = sp_lj[sbmask(j)];
       factor_sqrt = sp_sqrt[sbmask(j)];
+      numtyp factor_coul;
+      factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
-      #endif
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      #ifndef ONETYPE
-      int mtype=itype+jx.w;
-      const numtyp cutsq_p=cutsq[mtype];
-      #endif
       numtyp4 jv; fetch4(jv,j,vel_tex); //v_[j];
       int jtag=jv.w;
 
@@ -376,64 +431,89 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutsq_p) {
+      int mtype=itype+jx.w;
+      
+      // cutsq[mtype].x -> global squared cutoff
+      if (rsq<cutsq[mtype].x) {
         numtyp r=ucl_sqrt(rsq);
-        if (r < EPSILON) continue;
+        numtyp force_dpd = (numtyp)0.0;
+        numtyp force_coul = (numtyp)0.0;
 
-        numtyp rinv=ucl_recip(r);
-        numtyp delvx = iv.x - jv.x;
-        numtyp delvy = iv.y - jv.y;
-        numtyp delvz = iv.z - jv.z;
-        numtyp dot = delx*delvx + dely*delvy + delz*delvz;
-        #ifndef ONETYPE
-        const numtyp coeffw=coeff[mtype].w;
-        #endif
-        numtyp wd = (numtyp)1.0 - r/coeffw;
+        // apply DPD force if distance below DPD cutoff
+        // cutsq[mtype].y -> DPD squared cutoff
+        if (rsq < cutsq[mtype].y && r < EPSILON) {
 
-        const numtyp qj = extra[j].x;
-        
-        unsigned int tag1=itag, tag2=jtag;
-        if (tag1 > tag2) {
-          tag1 = jtag; tag2 = itag;
+          numtyp rinv=ucl_recip(r);
+          numtyp delvx = iv.x - jv.x;
+          numtyp delvy = iv.y - jv.y;
+          numtyp delvz = iv.z - jv.z;
+          numtyp dot = delx*delvx + dely*delvy + delz*delvz;
+          numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
+
+          const numtyp qj = extra[j].x;
+
+          unsigned int tag1=itag, tag2=jtag;
+          if (tag1 > tag2) {
+            tag1 = jtag; tag2 = itag;
+          }
+
+          numtyp randnum = (numtyp)0.0;
+          saru(tag1, tag2, seed, timestep, randnum);
+
+          // conservative force = a0 * wd, or 0 if tstat only
+          // drag force = -gamma * wd^2 * (delx dot delv) / r
+          // random force = sigma * wd * rnd * dtinvsqrt;
+
+          
+          if (!tstat_only) force_dpd = coeff[mtype].x*wd;
+          force_dpd -= coeff[mtype].y*wd*wd*dot*rinv;
+          force_dpd *= factor_dpd;
+          force_dpd += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
+          force_dpd *=rinv;
+        }
+      
+        // apply Slater electrostatic force if distance below Slater cutoff 
+        // and the two species have a slater coeff
+        // cutsq[mtype].w -> Coulombic squared cutoff
+        if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
+          numtyp r2inv=ucl_recip(rsq);
+          numtyp _erfc;
+
+          numtyp r = ucl_rsqrt(r2inv);
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = ucl_exp(-grij*grij);
+          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          numtyp prefactor = extra[j].x;
+          prefactor *= qqrd2e * cutsq[mtype].z * qtmp/r;
+          numtyp rlamdainv = r * lamdainv;
+          numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv);
+          numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv)));
+          force_coul = prefactor*(_erfc + EWALD_F*grij*expm2-slater_term);
+          if (factor_coul > (numtyp)0) force_coul -= factor_coul*prefactor*((numtyp)1.0-slater_term);
+          force_coul *= r2inv;
         }
 
-        numtyp randnum = (numtyp)0.0;
-        saru(tag1, tag2, seed, timestep, randnum);
-
-        // conservative force = a0 * wd, or 0 if tstat only
-        // drag force = -gamma * wd^2 * (delx dot delv) / r
-        // random force = sigma * wd * rnd * dtinvsqrt;
-
-        #ifndef ONETYPE
-        const numtyp coeffx=coeff[mtype].x;
-        const numtyp coeffy=coeff[mtype].y;
-        const numtyp coeffz=coeff[mtype].z;
-        #endif
-        numtyp force = (numtyp)0.0;
-        if (!tstat_only) force = coeffx*wd;
-        force -= coeffy*wd*wd*dot*rinv;
-        #ifndef ONETYPE
-        force *= factor_dpd;
-        force += factor_sqrt*coeffz*wd*randnum*dtinvsqrt;
-        #else
-        force += coeffz*wd*randnum*dtinvsqrt;
-        #endif
-        force*=rinv;
-
+        numtyp force = force_coul + force_dpd;
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (EVFLAG && eflag) {
-          // unshifted eng of conservative term:
-          // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
-          // eng shifted to 0.0 at cutoff
-          numtyp e = (numtyp)0.5*coeffx*coeffw * wd*wd;
-          #ifndef ONETYPE
-          energy+=factor_dpd*e;
-          #else
-          energy+=e;
-          #endif
+
+          if (rsq < cutsq[mtype].y && r < EPSILON) {
+            // unshifted eng of conservative term:
+            // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
+            // eng shifted to 0.0 at cutoff
+            numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd;
+            energy+=factor_dpd*e;
+          }
+          if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
+            numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv;
+            numtyp e = prefactor*(_erfc-e_slater);
+            if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater);
+            e_coul += e;
+          }
         }
         if (EVFLAG && vflag) {
           virial[0] += delx*delx*force;
@@ -443,6 +523,8 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
+
+
       }
 
     } // for nbor
diff --git a/lib/gpu/lal_dpd_charged.h b/lib/gpu/lal_dpd_charged.h
index fe64bb56c9..3f2da9f047 100644
--- a/lib/gpu/lal_dpd_charged.h
+++ b/lib/gpu/lal_dpd_charged.h
@@ -38,11 +38,12 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq, double **host_a0,
-           double **host_gamma, double **host_sigma, double **host_cut,
+           double **host_gamma, double **host_sigma,
            double **host_cut_dpd, double **host_cut_dpdsq, double **host_cut_slatersq,
-           double *host_special_lj, bool tstat_only, const int nlocal,
+           double **host_scale, double *host_special_lj, bool tstat_only, const int nlocal,
            const int nall, const int max_nbors, const int maxspecial,
-           const double cell_size, const double gpu_split, FILE *screen);
+           const double cell_size, const double gpu_split, FILE *screen,
+           const double qqrd2e, const double g_ewald, const double lamda);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
@@ -63,12 +64,15 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
   /// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut_dpd
   UCL_D_Vec<numtyp4> coeff;
 
-  /// cutsq.x = cutsq, cutsq.y = cut_dpd_sq, cutsq.z = cut_dpd, cutsq.w = cut_slatersq
+  /// cutsq.x = cutsq, cutsq.y = cut_dpd_sq, cutsq.z = scale, cutsq.w = cut_slatersq
   UCL_D_Vec<numtyp4> cutsq;
 
   /// Special LJ values
   UCL_D_Vec<numtyp> sp_lj, sp_sqrt;
 
+  /// Special Coul values [0-3]
+  UCL_D_Vec<numtyp> sp_cl;
+
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
@@ -78,6 +82,9 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
   /// Only used for thermostat
   int _tstat_only;
 
+  /// Coulombic terms
+  numtyp _cut_coulsq, _qqrd2e, _g_ewald, _lamda;
+
   /// pointer to host data for atom charge
   double *q;
 
diff --git a/lib/gpu/lal_dpd_charged_ext.cpp b/lib/gpu/lal_dpd_charged_ext.cpp
index 5a94aa6bf6..f05c093a7b 100644
--- a/lib/gpu/lal_dpd_charged_ext.cpp
+++ b/lib/gpu/lal_dpd_charged_ext.cpp
@@ -28,11 +28,12 @@ static DPDCharged<PRECISION,ACC_PRECISION> DPDCMF;
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
-                 double **host_gamma, double **host_sigma, double **host_cut,
-                 double **host_cut_dpd, double **host_cut_slater,
-                 double *special_lj, const int inum,
+                 double **host_gamma, double **host_sigma,
+                 double **host_cut_dpd, double **host_cut_dpd_sq, double **host_cut_slatersq,
+                 double **host_scale, double *special_lj, const int inum,
                  const int nall, const int max_nbors,  const int maxspecial,
-                 const double cell_size, int &gpu_mode, FILE *screen) {
+                 const double cell_size, int &gpu_mode, FILE *screen, double *host_special_coul,
+                 const double qqrd2e, const double g_ewald, const double lamda) {
   DPDCMF.clear();
   gpu_mode=DPDCMF.device->gpu_mode();
   double gpu_split=DPDCMF.device->particle_split();
@@ -56,8 +57,10 @@ int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
   int init_ok=0;
   if (world_me==0)
     init_ok=DPDCMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma,
-                       host_cut, host_cut_dpd, host_cut_dpdsq, host_cut_slatersq, special_lj, false, inum, nall, max_nbors,
-                       maxspecial, cell_size, gpu_split, screen);
+                       host_cut_dpd, host_cut_dpdsq, host_cut_slatersq,
+                       host_scale, special_lj, false, inum, nall, max_nbors,
+                       maxspecial, cell_size, gpu_split, screen,
+                       force->special_coul,->qqrd2e, g_ewald, lamda);
 
   DPDCMF.device->world_barrier();
   if (message)
@@ -74,8 +77,10 @@ int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=DPDCMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma,
-                         host_cut, host_cut_dpd, host_cut_slater, special_lj, false, inum, nall, max_nbors,
-                         maxspecial, cell_size, gpu_split, screen);
+                         host_cut_dpd, host_cut_dpdsq, host_cut_slatersq,
+                         host_scale, special_lj, false, inum, nall, max_nbors,
+                         maxspecial, cell_size, gpu_split, screen, 
+                         force->special_coul,force->qqrd2e, g_ewald, lamda);
 
     DPDCMF.device->serialize_init();
     if (message)
diff --git a/src/GPU/pair_dpd_charged_gpu.cpp b/src/GPU/pair_dpd_charged_gpu.cpp
index d92be2e7d2..2d05a4a33e 100644
--- a/src/GPU/pair_dpd_charged_gpu.cpp
+++ b/src/GPU/pair_dpd_charged_gpu.cpp
@@ -39,10 +39,11 @@ using namespace EwaldConst;
 // External functions from cuda library for atom decomposition
 
 int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0, double **host_gamma,
-                 double **host_sigma, double **host_cut, double **host_cut_dpd, double **host_cut_slater, double *special_lj, const int inum,
+                 double **host_sigma, double **host_cut_dpd, double **host_cut_dpd_sq, double **host_cut_slatersq,
+                 double **host_scale, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial, const double cell_size,
-                 int &gpu_mode, FILE *screen, 
-                 double *host_special_coul, const double qqrd2e, const double g_ewald, const double lamda);
+                 int &gpu_mode, FILE *screen, double *host_special_coul,
+                 const double qqrd2e, const double g_ewald, const double lamda);
 void dpd_charged_gpu_clear();
 int **dpd_charged_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x,
                         int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial,
@@ -313,8 +314,9 @@ void PairDPDChargedGPU::init_style()
   if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
   int mnf = 5e-2 * neighbor->oneatom;
   int success =
-      dpd_charged_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, sigma, cut, 
-                   cut_dpd, cut_dpdsq, cut_slatersq, force->special_lj, atom->nlocal,
+      dpd_charged_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, sigma,
+                   cut_dpd, cut_dpdsq, cut_slatersq, scale,
+                   force->special_lj, atom->nlocal,
                    atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen,
                    force->special_coul, force->qqrd2e, g_ewald, lamda);
   GPU_EXTRA::check_flag(success, error, world);