diff --git a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp
index f890fb53a3..1f5209852b 100644
--- a/lib/gpu/lal_dpd.cpp
+++ b/lib/gpu/lal_dpd.cpp
@@ -99,16 +99,25 @@ int DPDT::init(const int ntypes,
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq);
 
+  double special_sqrt[4];
+  special_sqrt[0] = sqrt(host_special_lj[0]);
+  special_sqrt[1] = sqrt(host_special_lj[1]);
+  special_sqrt[2] = sqrt(host_special_lj[2]);
+  special_sqrt[3] = sqrt(host_special_lj[3]);
+
   UCL_H_Vec<double> dview;
   sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
   dview.view(host_special_lj,4,*(this->ucl_device));
   ucl_copy(sp_lj,dview,false);
+  sp_sqrt.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  dview.view(special_sqrt,4,*(this->ucl_device));
+  ucl_copy(sp_sqrt,dview,false);
 
   _tstat_only = 0;
   if (tstat_only) _tstat_only=1;
 
   _allocated=true;
-  this->_max_bytes=coeff.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes();
+  this->_max_bytes=coeff.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes()+sp_sqrt.row_bytes();
   return 0;
 }
 
@@ -121,6 +130,7 @@ void DPDT::clear() {
   coeff.clear();
   cutsq.clear();
   sp_lj.clear();
+  sp_sqrt.clear();
   this->clear_atomic();
 }
 
@@ -144,7 +154,7 @@ int DPDT::loop(const int eflag, const int vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_sel->set_size(GX,BX);
-    this->k_pair_sel->run(&this->atom->x, &coeff, &sp_lj,
+    this->k_pair_sel->run(&this->atom->x, &coeff, &sp_lj, &sp_sqrt,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq,
@@ -152,7 +162,7 @@ int DPDT::loop(const int eflag, const int vflag) {
                           &this->_tstat_only, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj,
+    this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, &sp_sqrt,
                      &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
                      &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt,
diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu
index 2794110a92..0c861f51de 100644
--- a/lib/gpu/lal_dpd.cu
+++ b/lib/gpu/lal_dpd.cu
@@ -165,6 +165,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
                     const __global numtyp4 *restrict coeff,
                     const int lj_types,
                     const __global numtyp *restrict sp_lj,
+                    const __global numtyp *restrict sp_sqrt,
                     const __global int * dev_nbor,
                     const __global int * dev_packed,
                     __global acctyp4 *restrict ans,
@@ -200,11 +201,12 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
     numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i];
     int itag=iv.w;
 
-    numtyp factor_dpd;
+    numtyp factor_dpd, factor_sqrt;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
       int j=dev_packed[nbor];
       factor_dpd = sp_lj[sbmask(j)];
+      factor_sqrt = sp_sqrt[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -245,8 +247,9 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
         numtyp force = (numtyp)0.0;
         if (!tstat_only) force = coeff[mtype].x*wd;
         force -= coeff[mtype].y*wd*wd*dot*rinv;
-        force += coeff[mtype].z*wd*randnum*dtinvsqrt;
-        force*=factor_dpd*rinv;
+        force *= factor_dpd;
+        force += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
+        force*=rinv;
 
         f.x+=delx*force;
         f.y+=dely*force;
@@ -278,6 +281,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
 __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict coeff_in,
                          const __global numtyp *restrict sp_lj_in,
+                         const __global numtyp *restrict sp_sqrt_in,
                          const __global int * dev_nbor,
                          const __global int * dev_packed,
                          __global acctyp4 *restrict ans,
@@ -295,8 +299,10 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
   #ifndef ONETYPE
   __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
+  __local numtyp sp_sqrt[4];
   if (tid<4)
     sp_lj[tid]=sp_lj_in[tid];
+    sp_sqrt[tid]=sp_sqrt_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     coeff[tid]=coeff_in[tid];
   }
@@ -333,12 +339,15 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
     numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i];
     int itag=iv.w;
 
-    numtyp factor_dpd;
+    #ifndef ONETYPE
+    numtyp factor_dpd, factor_sqrt;
+    #endif
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
       int j=dev_packed[nbor];
       #ifndef ONETYPE
       factor_dpd = sp_lj[sbmask(j)];
+      factor_sqrt = sp_sqrt[sbmask(j)];
       j &= NEIGHMASK;
       #endif
 
@@ -390,12 +399,13 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
         numtyp force = (numtyp)0.0;
         if (!tstat_only) force = coeffx*wd;
         force -= coeffy*wd*wd*dot*rinv;
-        force += coeffz*wd*randnum*dtinvsqrt;
         #ifndef ONETYPE
-        force*=factor_dpd*rinv;
+        force *= factor_dpd;
+        force += factor_sqrt*coeffz*wd*randnum*dtinvsqrt;
         #else
-        force*=rinv;
+        force += coeffz*wd*randnum*dtinvsqrt;
         #endif
+        force*=rinv;
 
         f.x+=delx*force;
         f.y+=dely*force;
diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h
index be93d988a3..00890192a0 100644
--- a/lib/gpu/lal_dpd.h
+++ b/lib/gpu/lal_dpd.h
@@ -65,7 +65,7 @@ class DPD : public BaseDPD<numtyp, acctyp> {
   UCL_D_Vec<numtyp> cutsq;
 
   /// Special LJ values
-  UCL_D_Vec<numtyp> sp_lj;
+  UCL_D_Vec<numtyp> sp_lj, sp_sqrt;
 
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
diff --git a/src/GPU/pair_dpd_gpu.cpp b/src/GPU/pair_dpd_gpu.cpp
index 79fd3f1345..716978deac 100644
--- a/src/GPU/pair_dpd_gpu.cpp
+++ b/src/GPU/pair_dpd_gpu.cpp
@@ -313,7 +313,7 @@ void PairDPDGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, in
   int i, j, ii, jj, jnum, itype, jtype;
   double xtmp, ytmp, ztmp, delx, dely, delz, evdwl, fpair;
   double vxtmp, vytmp, vztmp, delvx, delvy, delvz;
-  double rsq, r, rinv, dot, wd, randnum, factor_dpd;
+  double rsq, r, rinv, dot, wd, randnum, factor_dpd, factor_sqrt;
   int *jlist;
   tagint itag, jtag;
 
@@ -344,6 +344,7 @@ void PairDPDGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, in
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_dpd = special_lj[sbmask(j)];
+      factor_sqrt = special_sqrt[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
@@ -376,10 +377,11 @@ void PairDPDGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, in
         // drag force = -gamma * wd^2 * (delx dot delv) / r
         // random force = sigma * wd * rnd * dtinvsqrt;
 
-        fpair = a0[itype][jtype] * wd;
-        fpair -= gamma[itype][jtype] * wd * wd * dot * rinv;
-        fpair += sigma[itype][jtype] * wd * randnum * dtinvsqrt;
-        fpair *= factor_dpd * rinv;
+        fpair = a0[itype][jtype]*wd;
+        fpair -= gamma[itype][jtype]*wd*wd*dot*rinv;
+        fpair *= factor_dpd;
+        fpair += factor_sqrt*sigma[itype][jtype]*wd*randnum*dtinvsqrt;
+        fpair *= rinv;
 
         f[i][0] += delx * fpair;
         f[i][1] += dely * fpair;
diff --git a/src/GPU/pair_dpd_tstat_gpu.cpp b/src/GPU/pair_dpd_tstat_gpu.cpp
index 49dacf3f1e..029bf7245e 100644
--- a/src/GPU/pair_dpd_tstat_gpu.cpp
+++ b/src/GPU/pair_dpd_tstat_gpu.cpp
@@ -329,7 +329,7 @@ void PairDPDTstatGPU::cpu_compute(int start, int inum, int /* eflag */, int /* v
   int i, j, ii, jj, jnum, itype, jtype;
   double xtmp, ytmp, ztmp, delx, dely, delz, fpair;
   double vxtmp, vytmp, vztmp, delvx, delvy, delvz;
-  double rsq, r, rinv, dot, wd, randnum, factor_dpd;
+  double rsq, r, rinv, dot, wd, randnum, factor_dpd, factor_sqrt;
   int *jlist;
   tagint itag, jtag;
 
@@ -360,6 +360,7 @@ void PairDPDTstatGPU::cpu_compute(int start, int inum, int /* eflag */, int /* v
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_dpd = special_lj[sbmask(j)];
+      factor_sqrt = special_sqrt[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
@@ -392,9 +393,9 @@ void PairDPDTstatGPU::cpu_compute(int start, int inum, int /* eflag */, int /* v
         // drag force = -gamma * wd^2 * (delx dot delv) / r
         // random force = sigma * wd * rnd * dtinvsqrt;
 
-        fpair = -gamma[itype][jtype] * wd * wd * dot * rinv;
-        fpair += sigma[itype][jtype] * wd * randnum * dtinvsqrt;
-        fpair *= factor_dpd * rinv;
+        fpair = -factor_dpd * gamma[itype][jtype] * wd * wd * dot * rinv;
+        fpair += factor_sqrt * sigma[itype][jtype] * wd * randnum * dtinvsqrt;
+        fpair *= rinv;
 
         f[i][0] += delx * fpair;
         f[i][1] += dely * fpair;