From 67ae64329e185cbd2e860f148e0024d1eb1091e7 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Fri, 27 Jun 2014 17:25:22 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12146
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/Nvidia.makefile       | 32 ++++++++++++++++++++++++--
 lib/gpu/Opencl.makefile       | 25 +++++++++++++++++++--
 lib/gpu/lal_born.cpp          | 19 ++++++++++++++++
 lib/gpu/lal_born.h            |  8 ++++++-
 lib/gpu/lal_born_ext.cpp      | 26 ++++++++++++++++++++++
 lib/gpu/lal_buck.cpp          | 18 +++++++++++++++
 lib/gpu/lal_buck.h            |  5 +++++
 lib/gpu/lal_buck_ext.cpp      | 25 +++++++++++++++++++++
 lib/gpu/lal_coul_long.cpp     | 38 +++++++++++++++++++------------
 lib/gpu/lal_coul_long.cu      | 38 ++++++++++++++++++-------------
 lib/gpu/lal_coul_long.h       | 16 ++++++++-----
 lib/gpu/lal_coul_long_ext.cpp | 42 ++++++++++++++++++++++++++---------
 lib/gpu/lal_gauss.cpp         | 15 +++++++++++++
 lib/gpu/lal_gauss.h           |  8 +++++--
 lib/gpu/lal_gauss_ext.cpp     | 22 ++++++++++++++++++
 lib/gpu/lal_lj.cpp            | 17 ++++++++++++++
 lib/gpu/lal_lj.h              |  7 +++++-
 lib/gpu/lal_lj_expand.cpp     | 20 +++++++++++++++++
 lib/gpu/lal_lj_expand.h       |  7 +++++-
 lib/gpu/lal_lj_expand_ext.cpp | 23 +++++++++++++++++++
 lib/gpu/lal_lj_ext.cpp        | 21 ++++++++++++++++++
 lib/gpu/lal_soft.cpp          | 15 +++++++++++++
 lib/gpu/lal_soft.h            |  8 +++++--
 lib/gpu/lal_soft_ext.cpp      | 22 ++++++++++++++++++
 24 files changed, 422 insertions(+), 55 deletions(-)

diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index da07cec1b7..f38b443845 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -67,7 +67,9 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
        $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
        $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
        $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
-       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o 
+       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \
+       $(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \
+       $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o
 
 CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
        $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@@ -114,7 +116,9 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
        $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
        $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
        $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \
-       $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h
+       $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h \
+       $(OBJ_DIR)/coul.cubin $(OBJ_DIR)/coul_cubin.h \
+       $(OBJ_DIR)/coul_debye.cubin $(OBJ_DIR)/coul_debye_cubin.h
 
 all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
 
@@ -676,6 +680,30 @@ $(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cubin.h $(OB
 $(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
 	$(CUDR) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/coul.cubin: lal_coul.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul.cu
+
+$(OBJ_DIR)/coul_cubin.h: $(OBJ_DIR)/coul.cubin $(OBJ_DIR)/coul.cubin
+	$(BIN2C) -c -n coul $(OBJ_DIR)/coul.cubin > $(OBJ_DIR)/coul_cubin.h
+
+$(OBJ_DIR)/lal_coul.o: $(ALL_H) lal_coul.h lal_coul.cpp $(OBJ_DIR)/coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_coul.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_ext.o: $(ALL_H) lal_coul.h lal_coul_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_coul_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/coul_debye.cubin: lal_coul_debye.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_debye.cu
+
+$(OBJ_DIR)/coul_debye_cubin.h: $(OBJ_DIR)/coul_debye.cubin $(OBJ_DIR)/coul_debye.cubin
+	$(BIN2C) -c -n coul_debye $(OBJ_DIR)/coul_debye.cubin > $(OBJ_DIR)/coul_debye_cubin.h
+
+$(OBJ_DIR)/lal_coul_debye.o: $(ALL_H) lal_coul_debye.h lal_coul_debye.cpp $(OBJ_DIR)/coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_coul_debye.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_debye_ext.o: $(ALL_H) lal_coul_debye.h lal_coul_debye_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_coul_debye_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda 
 
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 1d85c65354..b54704f63c 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -56,7 +56,9 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
        $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
        $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
-       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o 
+       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \
+       $(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \
+       $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o
 
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@@ -79,7 +81,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
        $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
        $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
-       $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h
+       $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h \
+       $(OBJ_DIR)/coul_cl.h $(OBJ_DIR)/coul_debye_cl.h
 
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -485,6 +488,24 @@ $(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp  $(OBJ_DIR)/dpd_cl.h $(OBJ_
 $(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
 	$(OCL) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/coul_cl.h: lal_coul.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh coul $(PRE1_H) lal_coul.cu $(OBJ_DIR)/coul_cl.h;
+
+$(OBJ_DIR)/lal_coul.o: $(ALL_H) lal_coul.h lal_coul.cpp  $(OBJ_DIR)/coul_cl.h $(OBJ_DIR)/coul_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_coul.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_ext.o: $(ALL_H) lal_coul.h lal_coul_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_coul_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/coul_debye_cl.h: lal_coul_debye.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh coul_debye $(PRE1_H) lal_coul_debye.cu $(OBJ_DIR)/coul_debye_cl.h;
+
+$(OBJ_DIR)/lal_coul_debye.o: $(ALL_H) lal_coul_debye.h lal_coul_debye.cpp  $(OBJ_DIR)/coul_debye_cl.h $(OBJ_DIR)/coul_debye_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_coul_debye.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_debye_ext.o: $(ALL_H) lal_coul_debye.h lal_coul_debye_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_coul_debye_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
 
diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp
index 719b1845d8..55cb24d3b0 100644
--- a/lib/gpu/lal_born.cpp
+++ b/lib/gpu/lal_born.cpp
@@ -97,6 +97,25 @@ int BornT::init(const int ntypes, double **host_cutsq,
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void BornT::reinit(const int ntypes, double **host_rhoinv,
+                   double **host_born1, double **host_born2,
+                   double **host_born3, double **host_a, double **host_c,
+                   double **host_d, double **host_offset) {
+  
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  
+  this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
+                         host_born1,host_born2,host_born3);
+  this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
+		                     host_d,host_offset);
+}
+
 template <class numtyp, class acctyp>
 void BornT::clear() {
   if (!_allocated)
diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h
index fe9c2a1819..6fed6461d2 100644
--- a/lib/gpu/lal_born.h
+++ b/lib/gpu/lal_born.h
@@ -45,7 +45,13 @@ class Born : public BaseAtomic<numtyp, acctyp> {
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen);
-
+  
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_rhoinv,
+              double **host_born1, double **host_born2,
+              double **host_born3, double **host_a, double **host_c,
+              double **host_d, double **host_offset);
+       
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp
index 3b593efb23..4dcc899623 100644
--- a/lib/gpu/lal_born_ext.cpp
+++ b/lib/gpu/lal_born_ext.cpp
@@ -92,6 +92,32 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   return init_ok;
 }
 
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void born_gpu_reinit(const int ntypes, double **host_rhoinv,
+                     double **host_born1, double **host_born2,
+                     double **host_born3, double **host_a, double **host_c,
+                     double **host_d, double **offset) {
+  int world_me=BORNMF.device->world_me();
+  int gpu_rank=BORNMF.device->gpu_rank();
+  int procs_per_gpu=BORNMF.device->procs_per_gpu();
+  
+  if (world_me==0)
+    BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
+                  host_born3, host_a, host_c, host_d, offset);
+  
+  BORNMF.device->world_barrier();
+  
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
+                    host_born3, host_a, host_c, host_d, offset);
+    
+    BORNMF.device->gpu_barrier();
+  }
+}
+
 void born_gpu_clear() {
   BORNMF.clear(); 
 }
diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp
index 53bfe3fae3..f66759ee3a 100644
--- a/lib/gpu/lal_buck.cpp
+++ b/lib/gpu/lal_buck.cpp
@@ -91,6 +91,24 @@ int BuckT::init(const int ntypes, double **host_cutsq,
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void BuckT::reinit(const int ntypes, double **host_cutsq,
+                   double **host_rhoinv, double **host_buck1, double **host_buck2,
+                   double **host_a, double **host_c, double **host_offset) {
+  
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  
+  this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
+                         host_buck1,host_buck2,host_cutsq);
+  this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
+                         host_offset);
+}
+
 template <class numtyp, class acctyp>
 void BuckT::clear() {
   if (!_allocated)
diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h
index b0b6347c47..ebcd72d990 100644
--- a/lib/gpu/lal_buck.h
+++ b/lib/gpu/lal_buck.h
@@ -45,6 +45,11 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen);
 
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+            double **host_rhoinv, double **host_buck1, double **host_buck2,
+            double **host_a, double **host_c, double **host_offset);
+  
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp
index af1f826751..b118a5b0eb 100644
--- a/lib/gpu/lal_buck_ext.cpp
+++ b/lib/gpu/lal_buck_ext.cpp
@@ -89,6 +89,31 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   return init_ok;
 }
 
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv,
+                  double **host_buck1, double **host_buck2,
+                  double **host_a, double **host_c, double **offset) {
+  int world_me=BUCKMF.device->world_me();
+  int gpu_rank=BUCKMF.device->gpu_rank();
+  int procs_per_gpu=BUCKMF.device->procs_per_gpu();
+  
+  if (world_me==0)
+    BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
+                  host_a, host_c, offset);
+  
+  BUCKMF.device->world_barrier();
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
+                    host_a, host_c, offset);
+    
+    BUCKMF.device->gpu_barrier();
+  }
+}
+
 void buck_gpu_clear() {
   BUCKMF.clear(); 
 }
diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp
index 53de36474d..ab0c8ce665 100644
--- a/lib/gpu/lal_coul_long.cpp
+++ b/lib/gpu/lal_coul_long.cpp
@@ -43,21 +43,19 @@ int CoulLongT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
-			 const int maxspecial, const double cell_size,
-			 const double gpu_split, FILE *_screen,
-			 const double host_cut_coulsq, double *host_special_coul,
-			 const double qqrd2e, const double g_ewald) {
+int CoulLongT::init(const int ntypes, double **host_scale,
+                    const int nlocal, const int nall, const int max_nbors,
+                    const int maxspecial, const double cell_size,
+                    const double gpu_split, FILE *_screen,
+                    const double host_cut_coulsq, double *host_special_coul,
+                    const double qqrd2e, const double g_ewald) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
 			                      gpu_split,_screen,coul_long,"k_coul_long");
   if (success!=0)
     return success;
 
-  // we don't have atom types for coulomb only,
-  // but go with the minimum so that we can use
-  // the same infrastructure as lj/cut/coul/long/gpu.
-  int lj_types=1;
+  int lj_types=ntypes;
   shared_types=false;
   int max_shared_types=this->device->max_shared_types();
   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
@@ -69,13 +67,16 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                                UCL_WRITE_ONLY);
-
+  
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
 
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
-
+  
+  scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale);
+  
   sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
     host_write[i]=host_special_coul[i];
@@ -87,10 +88,18 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
   _g_ewald=g_ewald;
 
   _allocated=true;
-  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_cl.row_bytes();
+  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+scale.row_bytes()+
+    sp_cl.row_bytes();
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void CoulLongT::reinit(const int ntypes, double **host_scale) {
+  UCL_H_Vec<numtyp> hscale(_lj_types*_lj_types,*(this->ucl_device),
+                           UCL_WRITE_ONLY);
+  this->atom->type_pack1(ntypes,_lj_types,scale,hscale,host_scale);
+}
+
 template <class numtyp, class acctyp>
 void CoulLongT::clear() {
   if (!_allocated)
@@ -99,6 +108,7 @@ void CoulLongT::clear() {
 
   lj1.clear();
   lj3.clear();
+  scale.clear();
   sp_cl.clear();
   this->clear_atomic();
 }
@@ -134,7 +144,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_cl,
+    this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv,
                           &eflag, &vflag, &ainum, &nbor_pitch, 
@@ -142,7 +152,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) {
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_cl,
+    this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl,
                      &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
                      &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu
index 5eb89f9c42..fcc69c44ab 100644
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@@ -124,8 +124,7 @@ texture<int2> q_tex;
 #endif
 
 __kernel void k_coul_long(const __global numtyp4 *restrict x_, 
-                          const __global numtyp4 *restrict lj1,
-                          const __global numtyp4 *restrict lj3, 
+                          const __global numtyp *restrict scale,
                           const int lj_types,
                           const __global numtyp *restrict sp_cl_in, 
                           const __global int *dev_nbor,
@@ -161,6 +160,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
               n_stride,list_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
     numtyp qtmp; fetch(qtmp,i,q_tex);
 
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -171,24 +171,26 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-
+      int jtype=jx.w;
+      
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
-
+      
+      int mtype=itype*lj_types+jtype;
       if (rsq < cut_coulsq) {
         numtyp r2inv=ucl_recip(rsq);
         numtyp force, prefactor, _erfc;
-
+        
         numtyp r = ucl_rsqrt(r2inv);
         numtyp grij = g_ewald * r;
         numtyp expm2 = ucl_exp(-grij*grij);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * qtmp/r;
+        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
         force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
@@ -196,7 +198,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-	  e_coul += prefactor*(_erfc-factor_coul);
+	        e_coul += prefactor*(_erfc-factor_coul);
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -215,8 +217,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
 }
 
 __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, 
-                               const __global numtyp4 *restrict lj1_in,
-                               const __global numtyp4 *restrict lj3_in,
+                               const __global numtyp *restrict scale_in,
                                const __global numtyp *restrict sp_cl_in,
                                const __global int *dev_nbor, 
                                const __global int *dev_packed,
@@ -230,10 +231,14 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
+  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_cl[4];
   if (tid<4)
     sp_cl[tid]=sp_cl_in[tid];
-
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    scale[tid]=scale_in[tid];
+  }
+  
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -252,7 +257,9 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
-
+    int iw = ix.w;
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+    
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
@@ -261,7 +268,8 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-
+      int mtype=itype+jx.w;
+      
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
@@ -272,13 +280,13 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
         numtyp r2inv=ucl_recip(rsq);
         numtyp force, prefactor, _erfc;
 
-        numtyp r = ucl_rsqrt(r2inv);
+        numtyp r = ucl_sqrt(rsq);
         numtyp grij = g_ewald * r;
         numtyp expm2 = ucl_exp(-grij*grij);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * qtmp/r;
+        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
         force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
@@ -286,7 +294,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-	  e_coul += prefactor*(_erfc-factor_coul);
+	        e_coul += prefactor*(_erfc-factor_coul);
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h
index 62103e7266..7e3f7f36a7 100644
--- a/lib/gpu/lal_coul_long.h
+++ b/lib/gpu/lal_coul_long.h
@@ -37,12 +37,16 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
-  int init(const int nlocal, const int nall, const int max_nbors,
+  int init(const int ntypes, double **scale,
+           const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
-	   const double gpu_split, FILE *screen, 
-	   const double host_cut_coulsq, double *host_special_coul,
-	   const double qqrd2e, const double g_ewald);
-
+	         const double gpu_split, FILE *screen,
+	         const double host_cut_coulsq, double *host_special_coul,
+	         const double qqrd2e, const double g_ewald);
+  
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **scale);
+  
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
@@ -59,6 +63,8 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
   UCL_D_Vec<numtyp4> lj1;
   /// lj3 dummy
   UCL_D_Vec<numtyp4> lj3;
+  /// scale
+  UCL_D_Vec<numtyp> scale;
   /// Special Coul values [0-3]
   UCL_D_Vec<numtyp> sp_cl;
 
diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp
index 904c33527f..49d593d9a0 100644
--- a/lib/gpu/lal_coul_long_ext.cpp
+++ b/lib/gpu/lal_coul_long_ext.cpp
@@ -27,10 +27,11 @@ static CoulLong<PRECISION,ACC_PRECISION> CLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int cl_gpu_init(const int inum, const int nall, const int max_nbors,
-		const int maxspecial, const double cell_size, int &gpu_mode,
-		FILE *screen, double host_cut_coulsq, double *host_special_coul,
-		const double qqrd2e, const double g_ewald) {
+int cl_gpu_init(const int ntypes, double **host_scale,
+                const int inum, const int nall, const int max_nbors,
+                const int maxspecial, const double cell_size, int &gpu_mode,
+                FILE *screen, double host_cut_coulsq, double *host_special_coul,
+                const double qqrd2e, const double g_ewald) {
   CLMF.clear();
   gpu_mode=CLMF.device->gpu_mode();
   double gpu_split=CLMF.device->particle_split();
@@ -53,9 +54,9 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors,
 
   int init_ok=0;
   if (world_me==0)
-    init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
-		      screen, host_cut_coulsq, host_special_coul, qqrd2e,
-		      g_ewald);
+    init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial,
+                      cell_size, gpu_split, screen, host_cut_coulsq,
+                      host_special_coul, qqrd2e, g_ewald);
 
   CLMF.device->world_barrier();
   if (message)
@@ -71,9 +72,9 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
-			screen, host_cut_coulsq, host_special_coul,
-			qqrd2e, g_ewald);
+      init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial,
+                        cell_size, gpu_split, screen, host_cut_coulsq,
+                        host_special_coul, qqrd2e, g_ewald);
 
     CLMF.device->gpu_barrier();
     if (message)
@@ -87,6 +88,27 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors,
   return init_ok;
 }
 
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void cl_gpu_reinit(const int ntypes, double **host_scale) {
+  int world_me=CLMF.device->world_me();
+  int gpu_rank=CLMF.device->gpu_rank();
+  int procs_per_gpu=CLMF.device->procs_per_gpu();
+  
+  if (world_me==0)
+    CLMF.reinit(ntypes, host_scale);
+  
+  CLMF.device->world_barrier();
+  
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      CLMF.reinit(ntypes, host_scale);
+    
+    CLMF.device->gpu_barrier();
+  }
+}
+
 void cl_gpu_clear() {
   CLMF.clear();
 }
diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp
index 9e296e6872..342ec4ecda 100644
--- a/lib/gpu/lal_gauss.cpp
+++ b/lib/gpu/lal_gauss.cpp
@@ -87,6 +87,21 @@ int GaussT::init(const int ntypes,
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a,
+                    double **host_b, double **host_offset) {
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  
+  this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b,
+			                   host_cutsq,host_offset);
+}
+
 template <class numtyp, class acctyp>
 void GaussT::clear() {
   if (!_allocated)
diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h
index fad790899d..1fd58adae5 100644
--- a/lib/gpu/lal_gauss.h
+++ b/lib/gpu/lal_gauss.h
@@ -14,7 +14,7 @@
  ***************************************************************************/
 
 #ifndef LAL_GAUSS_H
-#define LAL_GAYSS_H
+#define LAL_GAUSS_H
 
 #include "lal_base_atomic.h"
 
@@ -43,7 +43,11 @@ class Gauss : public BaseAtomic<numtyp, acctyp> {
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen);
-
+  
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_a, double **host_b, double **host_offset);
+           
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp
index 818c7cbd26..fdef451366 100644
--- a/lib/gpu/lal_gauss_ext.cpp
+++ b/lib/gpu/lal_gauss_ext.cpp
@@ -88,6 +88,28 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
   return init_ok;
 }
 
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a,
+                      double **host_b, double **offset) {
+  int world_me=GLMF.device->world_me();
+  int gpu_rank=GLMF.device->gpu_rank();
+  int procs_per_gpu=GLMF.device->procs_per_gpu();
+  
+  if (world_me==0)
+    GLMF.reinit(ntypes, cutsq, host_a, host_b, offset);
+  
+  GLMF.device->world_barrier();
+  
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      GLMF.reinit(ntypes, cutsq, host_a, host_b, offset);
+    
+    GLMF.device->gpu_barrier();
+  }
+}
+
 void gauss_gpu_clear() {
   GLMF.clear();
 }
diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp
index 7cc3ee48b3..6c6e145319 100644
--- a/lib/gpu/lal_lj.cpp
+++ b/lib/gpu/lal_lj.cpp
@@ -92,6 +92,23 @@ int LJT::init(const int ntypes,
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void LJT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3,
+                 double **host_lj4, double **host_offset) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq);
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+}
+
 template <class numtyp, class acctyp>
 void LJT::clear() {
   if (!_allocated)
diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h
index 9555a3c881..63a3e8a6c9 100644
--- a/lib/gpu/lal_lj.h
+++ b/lib/gpu/lal_lj.h
@@ -43,7 +43,12 @@ class LJ : public BaseAtomic<numtyp, acctyp> {
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen);
-
+  
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset);
+  
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp
index 474a91c6b3..03526bc095 100644
--- a/lib/gpu/lal_lj_expand.cpp
+++ b/lib/gpu/lal_lj_expand.cpp
@@ -92,6 +92,26 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void LJExpandT::reinit(const int ntypes, double **host_cutsq,
+                       double **host_lj1, double **host_lj2,
+                       double **host_lj3, double **host_lj4,
+                       double **host_offset, double **host_shift) {
+  
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq, host_shift);
+  
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+}
+
 template <class numtyp, class acctyp>
 void LJExpandT::clear() {
   if (!_allocated)
diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h
index fd59045bf8..0d0ae0b2e6 100644
--- a/lib/gpu/lal_lj_expand.h
+++ b/lib/gpu/lal_lj_expand.h
@@ -43,7 +43,12 @@ class LJExpand : public BaseAtomic<numtyp, acctyp> {
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen);
-
+  
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset, double **host_shift);
+  
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp
index 725004b0ef..c1cc99fd12 100644
--- a/lib/gpu/lal_lj_expand_ext.cpp
+++ b/lib/gpu/lal_lj_expand_ext.cpp
@@ -89,6 +89,29 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   return init_ok;
 }
 
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+int lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                   double **host_lj2, double **host_lj3, double **host_lj4,
+                   double **offset, double **shift) {
+  int world_me=LJEMF.device->world_me();
+  int gpu_rank=LJEMF.device->gpu_rank();
+  int procs_per_gpu=LJEMF.device->procs_per_gpu();
+  
+  if (world_me==0)
+    LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                offset, shift);
+  LJEMF.device->world_barrier();
+  
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                   offset, shift);
+    LJEMF.device->gpu_barrier();
+  }
+}
+
 void lje_gpu_clear() {
   LJEMF.clear();
 }
diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp
index 3d85f689d0..674c1984bc 100644
--- a/lib/gpu/lal_lj_ext.cpp
+++ b/lib/gpu/lal_lj_ext.cpp
@@ -88,6 +88,27 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   return init_ok;
 }
 
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                    double **host_lj2, double **host_lj3, double **host_lj4,
+                    double **offset) {
+  int world_me=LJLMF.device->world_me();
+  int gpu_rank=LJLMF.device->gpu_rank();
+  int procs_per_gpu=LJLMF.device->procs_per_gpu();
+  
+  if (world_me==0)
+    LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
+  LJLMF.device->world_barrier();
+  
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
+    LJLMF.device->gpu_barrier();
+  }
+}
+
 void ljl_gpu_clear() {
   LJLMF.clear();
 }
diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp
index 8db144080e..c206a997a9 100644
--- a/lib/gpu/lal_soft.cpp
+++ b/lib/gpu/lal_soft.cpp
@@ -86,6 +86,21 @@ int SoftT::init(const int ntypes, double **host_cutsq,
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void SoftT::reinit(const int ntypes, double **host_cutsq,
+                   double **host_prefactor, double **host_cut) {
+  
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  
+  this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor,
+			                   host_cut,host_cutsq);
+}
+
 template <class numtyp, class acctyp>
 void SoftT::clear() {
   if (!_allocated)
diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h
index 40e568e847..7fa529c4f5 100644
--- a/lib/gpu/lal_soft.h
+++ b/lib/gpu/lal_soft.h
@@ -13,8 +13,8 @@
     email                : nguyentd@ornl.gov
  ***************************************************************************/
 
-#ifndef LAL_GAUSS_H
-#define LAL_GAYSS_H
+#ifndef LAL_SOFT_H
+#define LAL_SOFT_H
 
 #include "lal_base_atomic.h"
 
@@ -44,6 +44,10 @@ class Soft : public BaseAtomic<numtyp, acctyp> {
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen);
 
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_prefactor, double **host_cut);
+           
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp
index 1872388bb3..5e7f38d054 100644
--- a/lib/gpu/lal_soft_ext.cpp
+++ b/lib/gpu/lal_soft_ext.cpp
@@ -88,6 +88,28 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
   return init_ok;
 }
 
+// ---------------------------------------------------------------------------
+// Copy updated constants to device
+// ---------------------------------------------------------------------------
+void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor,
+                    double **host_cut) {
+  int world_me=SLMF.device->world_me();
+  int gpu_rank=SLMF.device->gpu_rank();
+  int procs_per_gpu=SLMF.device->procs_per_gpu();
+  
+  if (world_me==0)
+    SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut);
+  
+  SLMF.device->world_barrier();
+  
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut);
+    
+    SLMF.device->gpu_barrier();
+  }
+}
+
 void soft_gpu_clear() {
   SLMF.clear();
 }