From bbffbe06a229a590abed9ba91fe850f1b00c71b5 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Mon, 13 Jan 2014 15:51:54 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11230
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/Nvidia.makefile        |  18 +-
 lib/gpu/Opencl.makefile        |  15 +-
 lib/gpu/lal_answer.cpp         |  35 ++--
 lib/gpu/lal_buck_coul_long.cpp |   1 +
 lib/gpu/lal_coul_dsf.cpp       |   2 +-
 lib/gpu/lal_coul_dsf.h         |   2 +-
 lib/gpu/lal_sw.cpp             | 172 ++++++++++++-----
 lib/gpu/lal_sw.cu              | 327 ++++++++++++++++++++++-----------
 lib/gpu/lal_sw.h               |  32 +++-
 lib/gpu/lal_sw_ext.cpp         |  27 +--
 10 files changed, 438 insertions(+), 193 deletions(-)

diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index a5df7bf23d..e93805df3b 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -64,7 +64,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
        $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
        $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
        $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
-       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
+       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
+       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
 
 CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
        $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@@ -109,7 +110,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
        $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
        $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
        $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
-       $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h
+       $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
+       $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h
 
 all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
 
@@ -644,6 +646,18 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(O
 $(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lj_gromacs.cubin: lal_lj_gromacs.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_gromacs.cu
+
+$(OBJ_DIR)/lj_gromacs_cubin.h: $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs.cubin
+	$(BIN2C) -c -n lj_gromacs $(OBJ_DIR)/lj_gromacs.cubin > $(OBJ_DIR)/lj_gromacs_cubin.h
+
+$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda 
 
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 81e80588cd..bb528f1e2b 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -53,7 +53,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
        $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
        $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
-       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
+       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
+       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
 
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@@ -75,7 +76,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
        $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
        $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
-       $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h
+       $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
+       $(OBJ_DIR)/lj_gromacs_cl.h
 
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -460,6 +462,15 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp  $(
 $(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lj_gromacs_cl.h: lal_lj_gromacs.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh lj_gromacs $(PRE1_H) lal_lj_gromacs.cu $(OBJ_DIR)/lj_gromacs_cl.h;
+
+$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp  $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
 
diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp
index 7dbd875cd1..ddf893e4ed 100644
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@@ -150,7 +150,7 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
     csize-=_e_fields;
   if (!vflag)
     csize-=6;
-      
+
   if (csize>0)
     engv.update_host(_inum*csize,true);
   if (_rot)
@@ -194,12 +194,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
       for (int i=vstart; i<iend; i++)
         virial[j]+=engv[i];
       if (_vf_atom)
-        if (_ilist==NULL)
-          for (int i=vstart; i<iend; i++)
-            vatom[i][j]+=engv[i];
-        else
-          for (int i=vstart; i<iend; i++)
-            vatom[_ilist[i]][j]+=engv[i];
+        if (_ilist==NULL) {
+          int ii=0;
+          for (int i=vstart; i<iend; i++) 
+            vatom[ii++][j]+=engv[i];
+        } else {
+          int ii=0;
+          for (int i=vstart; i<iend; i++) 
+            vatom[_ilist[ii++]][j]+=engv[i];
+        }
       vstart+=_inum;
       iend+=_inum;
     }
@@ -218,8 +221,9 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
     return energy_virial(eatom,vatom,virial);
 
   double evdwl=0.0;
-  int vstart=0, iend=_inum*2;
+  int vstart=0, iend=_inum;
   if (_eflag) {
+    iend=_inum*2;
     for (int i=0; i<_inum; i++)
       evdwl+=engv[i];
     for (int i=_inum; i<iend; i++)
@@ -238,18 +242,21 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
       }
     vstart=iend;
     iend+=_inum;
-  }
-  if (_vflag) {
+  } 
+  if (_vflag) { 
     for (int j=0; j<6; j++) {
       for (int i=vstart; i<iend; i++)
         virial[j]+=engv[i];
       if (_vf_atom)
-        if (_ilist==NULL)
+        if (_ilist==NULL) {
+          int ii=0;
           for (int i=vstart; i<iend; i++)
-              vatom[i][j]+=engv[i];
-        else
+            vatom[ii++][j]+=engv[i];
+        } else {
+          int ii=0;
           for (int i=vstart; i<iend; i++)
-              vatom[_ilist[i]][j]+=engv[i];
+            vatom[_ilist[ii++]][j]+=engv[i];
+        }  
       vstart+=_inum;
       iend+=_inum;
     }
diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp
index 98c251714f..4aa720132a 100644
--- a/lib/gpu/lal_buck_coul_long.cpp
+++ b/lib/gpu/lal_buck_coul_long.cpp
@@ -112,6 +112,7 @@ void BuckCoulLongT::clear() {
 
   coeff1.clear();
   coeff2.clear();
+  cutsq.clear();
   sp_lj.clear();
   this->clear_atomic();
 }
diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp
index 4f338eb051..ca81d32b2d 100644
--- a/lib/gpu/lal_coul_dsf.cpp
+++ b/lib/gpu/lal_coul_dsf.cpp
@@ -10,7 +10,7 @@
  __________________________________________________________________________
 
     begin                : 8/15/2012
-    email                : nguyentdw@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #if defined(USE_OPENCL)
diff --git a/lib/gpu/lal_coul_dsf.h b/lib/gpu/lal_coul_dsf.h
index 0579ab46d5..0c5b063026 100644
--- a/lib/gpu/lal_coul_dsf.h
+++ b/lib/gpu/lal_coul_dsf.h
@@ -10,7 +10,7 @@
  __________________________________________________________________________
 
     begin                : 8/15/2012
-    email                : nguyentdw@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifndef LAL_LJ_DSF_H
diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp
index bf6f820b69..27974874b0 100644
--- a/lib/gpu/lal_sw.cpp
+++ b/lib/gpu/lal_sw.cpp
@@ -16,7 +16,7 @@
 #if defined(USE_OPENCL)
 #include "sw_cl.h"
 #elif defined(USE_CUDART)
-const char *lj=0;
+const char *sw=0;
 #else
 #include "sw_cubin.h"
 #endif
@@ -43,28 +43,15 @@ int SWT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-int SWT::init(const int nlocal, const int nall, const int max_nbors, 
-              const double cell_size, const double gpu_split, FILE *_screen,
-              const double epsilon, const double sigma,
-              const double lambda, const double gamma,
-              const double costheta, const double biga,
-              const double bigb, const double powerp,
-              const double powerq, const double cut, const double cutsq) {
-
-  sw_epsilon=static_cast<numtyp>(epsilon);
-  sw_sigma=static_cast<numtyp>(sigma);
-  sw_lambda=static_cast<numtyp>(lambda);
-  sw_gamma=static_cast<numtyp>(gamma);
-  sw_costheta=static_cast<numtyp>(costheta);
-  sw_biga=static_cast<numtyp>(biga);
-  sw_bigb=static_cast<numtyp>(bigb);
-  sw_powerp=static_cast<numtyp>(powerp);
-  sw_powerq=static_cast<numtyp>(powerq);
-  sw_cut=static_cast<numtyp>(cut);
-  sw_cutsq=static_cast<numtyp>(cutsq);
-  if (sw_cutsq>=sw_cut*sw_cut) 
-    sw_cutsq=sw_cut*sw_cut-1e-4;
-
+int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors,
+           const double cell_size, const double gpu_split, FILE *_screen,
+           int* host_map, const int nelements, int*** host_elem2param, const int nparams, 
+           const double* epsilon, const double* sigma,
+           const double* lambda, const double* gamma,
+           const double* costheta, const double* biga,
+           const double* bigb, const double* powerp,
+           const double* powerq, const double* cut, const double* cutsq)
+{
   int success;
   success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                            _screen,sw,"k_sw","k_sw_three_center",
@@ -73,10 +60,93 @@ int SWT::init(const int nlocal, const int nall, const int max_nbors,
     return success;
 
   // If atom type constants fit in shared memory use fast kernel
-  shared_types=true;
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  _nparams = nparams;
+  _nelements = nelements;
+
+  UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
+                             UCL_WRITE_ONLY);
+
+  for (int i=0; i<nparams; i++) {
+    dview[i].x=(numtyp)0; 
+    dview[i].y=(numtyp)0;
+    dview[i].z=(numtyp)0; 
+    dview[i].w=(numtyp)0;
+  }
+
+  // pack coefficients into arrays
+  sw1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
+  
+  for (int i=0; i<nparams; i++) {
+    dview[i].x=static_cast<numtyp>(epsilon[i]);
+    dview[i].y=static_cast<numtyp>(sigma[i]);
+    dview[i].z=static_cast<numtyp>(lambda[i]);
+    dview[i].w=static_cast<numtyp>(gamma[i]);
+  }
+  
+  ucl_copy(sw1,dview,false);
+  sw1_tex.get_texture(*(this->pair_program),"sw1_tex");
+  sw1_tex.bind_float(sw1,4);
+
+  sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
+  
+  for (int i=0; i<nparams; i++) {
+    dview[i].x=static_cast<numtyp>(biga[i]);
+    dview[i].y=static_cast<numtyp>(bigb[i]);
+    dview[i].z=static_cast<numtyp>(powerp[i]);
+    dview[i].w=static_cast<numtyp>(powerq[i]);
+  }
+  
+  ucl_copy(sw2,dview,false);
+  sw2_tex.get_texture(*(this->pair_program),"sw2_tex");
+  sw2_tex.bind_float(sw2,4);
+
+  sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
+  
+  for (int i=0; i<nparams; i++) {
+    dview[i].x=static_cast<numtyp>(cut[i]);
+    dview[i].y=static_cast<numtyp>(cutsq[i]);
+    dview[i].z=static_cast<numtyp>(costheta[i]);
+    dview[i].w=(numtyp)0;
+  }
+  
+  ucl_copy(sw3,dview,false);
+  sw3_tex.get_texture(*(this->pair_program),"sw3_tex");
+  sw3_tex.bind_float(sw3,4);
+
+  UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
+                           *(this->ucl_device), UCL_WRITE_ONLY);
+
+  elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device),
+                   UCL_READ_ONLY);
+
+  for (int i = 0; i < nelements; i++)
+    for (int j = 0; j < nelements; j++)
+      for (int k = 0; k < nelements; k++) {
+         int idx = i*nelements*nelements+j*nelements+k;
+         dview_elem2param[idx] = host_elem2param[i][j][k];
+      }
+
+  ucl_copy(elem2param,dview_elem2param,false);
+
+  UCL_H_Vec<int> dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY);
+  for (int i = 0; i < lj_types; i++)
+    dview_map[i] = host_map[i];
+
+  map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY);
+  ucl_copy(map,dview_map,false);
 
   _allocated=true;
-  this->_max_bytes=0;
+  this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+
+    map.row_bytes()+elem2param.row_bytes();
   return 0;
 }
 
@@ -86,6 +156,11 @@ void SWT::clear() {
     return;
   _allocated=false;
 
+  sw1.clear();
+  sw2.clear();
+  sw3.clear();
+  map.clear();
+  elem2param.clear();
   this->clear_atomic();
 }
 
@@ -121,22 +196,23 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   this->k_pair.set_size(GX,BX);
-  this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor, 
-                   &this->_nbor_data->begin(), &this->ans->force,
-                   &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, 
-                   &this->_threads_per_atom, &sw_cut, &sw_epsilon, &sw_sigma,
-                   &sw_biga, &sw_bigb, &sw_powerp, &sw_powerq, &sw_cutsq);
+  this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, 
+                   &map, &elem2param, &_nelements,
+                   &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                   &this->ans->force, &this->ans->engv, 
+                   &eflag, &vflag, &ainum, &nbor_pitch, 
+                   &this->_threads_per_atom);
 
   BX=this->block_size();
   GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                            (BX/(KTHREADS*JTHREADS)))); 
   this->k_three_center.set_size(GX,BX);
-  this->k_three_center.run(&this->atom->x, &this->nbor->dev_nbor, 
-                   &this->_nbor_data->begin(), &this->ans->force,
-                   &this->ans->engv, &eflag, &vflag, &ainum, 
-                   &nbor_pitch, &this->_threads_per_atom, &evatom,
-                   &sw_cut, &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
-                   &sw_costheta, &sw_cutsq);
+  this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, 
+                           &map, &elem2param, &_nelements,
+                           &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                           &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, 
+                           &nbor_pitch, &this->_threads_per_atom, &evatom);
+
   Answer<numtyp,acctyp> *end_ans;
   #ifdef THREE_CONCURRENT
   end_ans=this->ans2;
@@ -145,20 +221,20 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   #endif
   if (evatom!=0) {
     this->k_three_end_vatom.set_size(GX,BX);
-    this->k_three_end_vatom.run(&this->atom->x, &this->nbor->dev_nbor, 
-                          &this->_nbor_data->begin(), &end_ans->force,
-                          &end_ans->engv, &eflag, &vflag, &ainum, 
-                          &nbor_pitch, &this->_threads_per_atom, &sw_cut, 
-                          &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
-                          &sw_costheta, &sw_cutsq);
+    this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, 
+                          &map, &elem2param, &_nelements, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
+                          &nbor_pitch, &this->_threads_per_atom);
+
   } else {
     this->k_three_end.set_size(GX,BX);
-    this->k_three_end.run(&this->atom->x, &this->nbor->dev_nbor, 
-                          &this->_nbor_data->begin(), &end_ans->force,
-                          &end_ans->engv, &eflag, &vflag, &ainum, 
-                          &nbor_pitch, &this->_threads_per_atom, &sw_cut, 
-                          &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
-                          &sw_costheta, &sw_cutsq);
+    this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, 
+                          &map, &elem2param, &_nelements, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, 
+                          &nbor_pitch, &this->_threads_per_atom);
+
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu
index f831b55b00..2408602e73 100644
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@@ -15,13 +15,24 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
+
 #ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
+texture<float4> sw1_tex;
+texture<float4> sw2_tex;
+texture<float4> sw3_tex;
 #else
 texture<int4,1> pos_tex;
+texture<int4> sw1_tex;
+texture<int4> sw2_tex;
+texture<int4> sw3_tex;
 #endif
+
 #else
 #define pos_tex x_
+#define sw1_tex sw1
+#define sw2_tex sw2
+#define sw3_tex sw3
 #endif
 
 #define THIRD (numtyp)0.66666667
@@ -60,15 +71,15 @@ texture<int4,1> pos_tex;
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv+=energy*(acctyp)0.5;                                            \
-      engv+=inum;                                                           \
+      engv[ei]+=energy*(acctyp)0.5;                                         \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv+=virial[i]*(acctyp)0.5;                                       \
-        engv+=inum;                                                         \
+        engv[ei]+=virial[i]*(acctyp)0.5;                                    \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     acctyp4 old=ans[ii];                                                    \
@@ -97,15 +108,15 @@ texture<int4,1> pos_tex;
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv+=energy*(acctyp)0.5;                                            \
-      engv+=inum;                                                           \
+      engv[ei]+=energy*(acctyp)0.5;                                         \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv+=virial[i]*(acctyp)0.5;                                       \
-        engv+=inum;                                                         \
+        engv[ei]+=virial[i]*(acctyp)0.5;                                    \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     acctyp4 old=ans[ii];                                                    \
@@ -118,34 +129,20 @@ texture<int4,1> pos_tex;
 #endif
 
 
-__kernel void k_sw(const __global numtyp4 *restrict x_, 
+__kernel void k_sw(const __global numtyp4 *restrict x_,
+                   const __global numtyp4 *restrict sw1,
+                   const __global numtyp4 *restrict sw2,
+                   const __global numtyp4 *restrict sw3,
+                   const __global int *restrict map,
+                   const __global int *restrict elem2param,
+                   const int nelements,
                    const __global int * dev_nbor, 
                    const __global int * dev_packed, 
                    __global acctyp4 *restrict ans, 
                    __global acctyp *restrict engv, 
                    const int eflag, const int vflag, const int inum, 
-                   const int nbor_pitch, const int t_per_atom, 
-                   const numtyp sw_cut, const numtyp sw_epsilon, 
-                   const numtyp sw_sigma, const numtyp sw_biga, 
-                   const numtyp sw_bigb, const numtyp sw_powerp, 
-                   const numtyp sw_powerq, const numtyp sw_cutsq) {
-
+                   const int nbor_pitch, const int t_per_atom) {
   __local int n_stride;
-  __local numtyp pre_sw_c1, pre_sw_c2, pre_sw_c3, pre_sw_c4;
-  __local numtyp pre_sw_c5, pre_sw_c6;
-  pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
-            pow(sw_sigma,sw_powerp);
-  pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
-            pow(sw_sigma,sw_powerq);
-  pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
-            pow(sw_sigma,sw_powerp+(numtyp)1.0);
-  pre_sw_c4=sw_biga*sw_epsilon*
-            pow(sw_sigma,sw_powerq+(numtyp)1.0);
-  pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
-            pow(sw_sigma,sw_powerp);
-  pre_sw_c6=sw_biga*sw_epsilon*
-            pow(sw_sigma,sw_powerq);
-
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -157,7 +154,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   __syncthreads();
-  
+
   if (ii<inum) {
     const __global int *nbor, *list_end;
     int i, numj;
@@ -165,16 +162,19 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
               n_stride,list_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //int iw=ix.w;
-    //int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
-
+    int itype=ix.w;
+    itype=map[itype];
+    
     for ( ; nbor<list_end; nbor+=n_stride) {
   
       int j=*nbor;
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int mtype=itype+jx.w;
+      int jtype=jx.w;
+      jtype=map[jtype];
+
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -182,7 +182,31 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
         
-      if (rsq<sw_cutsq) {
+      if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
+        numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+        numtyp sw_epsilon=sw1_ijparam.x;
+        numtyp sw_sigma=sw1_ijparam.y;
+        numtyp4 sw2_ijparam; fetch4(sw2_ijparam,ijparam,sw2_tex);
+        numtyp sw_biga=sw2_ijparam.x;
+        numtyp sw_bigb=sw2_ijparam.y;
+        numtyp sw_powerp=sw2_ijparam.z;
+        numtyp sw_powerq=sw2_ijparam.w;
+        numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
+        numtyp sw_cut=sw3_ijparam.x;
+        numtyp sw_cutsq=sw3_ijparam.y;
+        numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
+            pow(sw_sigma,sw_powerp);
+        numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
+            pow(sw_sigma,sw_powerq);
+        numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
+            pow(sw_sigma,sw_powerp+(numtyp)1.0);
+        numtyp pre_sw_c4=sw_biga*sw_epsilon*
+            pow(sw_sigma,sw_powerq+(numtyp)1.0);
+        numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
+            pow(sw_sigma,sw_powerp);
+        numtyp pre_sw_c6=sw_biga*sw_epsilon*
+            pow(sw_sigma,sw_powerq);
+
         numtyp r=ucl_sqrt(rsq);
         numtyp rp=ucl_powr(r,-sw_powerp);
         numtyp rq=ucl_powr(r,-sw_powerq);
@@ -209,40 +233,41 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
           virial[5] += dely*delz*force;
         }
       }
-
     } // for nbor
+
     store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
+
 }
 
 #define threebody(delr1x, delr1y, delr1z, eflag, energy)                     \
 {                                                                            \
   numtyp r1 = ucl_sqrt(rsq1);                                                \
   numtyp rinvsq1 = ucl_recip(rsq1);                                          \
-  numtyp rainv1 = ucl_recip(r1 - sw_cut);                                \
-  numtyp gsrainv1 = sw_sigma_gamma * rainv1;                             \
+  numtyp rainv1 = ucl_recip(r1 - sw_cut_ij);                                 \
+  numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1;                              \
   numtyp gsrainvsq1 = gsrainv1*rainv1/r1;                                    \
   numtyp expgsrainv1 = ucl_exp(gsrainv1);                                    \
                                                                              \
   numtyp r2 = ucl_sqrt(rsq2);                                                \
   numtyp rinvsq2 = ucl_recip(rsq2);                                          \
-  numtyp rainv2 = ucl_recip(r2 - sw_cut);                                \
-  numtyp gsrainv2 = sw_sigma_gamma * rainv2;                             \
+  numtyp rainv2 = ucl_recip(r2 - sw_cut_ik);                                 \
+  numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2;                              \
   numtyp gsrainvsq2 = gsrainv2*rainv2/r2;                                    \
   numtyp expgsrainv2 = ucl_exp(gsrainv2);                                    \
                                                                              \
   numtyp rinv12 = ucl_recip(r1*r2);                                          \
   numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12;      \
-  numtyp delcs = cs - sw_costheta;                                       \
+  numtyp delcs = cs - sw_costheta_ijk;                                       \
   numtyp delcssq = delcs*delcs;                                              \
                                                                              \
   numtyp facexp = expgsrainv1*expgsrainv2;                                   \
                                                                              \
-  numtyp facrad = sw_lambda_epsilon * facexp*delcssq;                    \
+  numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq;                    \
   numtyp frad1 = facrad*gsrainvsq1;                                          \
   numtyp frad2 = facrad*gsrainvsq2;                                          \
-  numtyp facang = sw_lambda_epsilon2 * facexp*delcs;                     \
+  numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs;                     \
   numtyp facang12 = rinv12*facang;                                           \
   numtyp csfacang = cs*facang;                                               \
   numtyp csfac1 = rinvsq1*csfacang;                                          \
@@ -273,26 +298,26 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
 {                                                                            \
   numtyp r1 = ucl_sqrt(rsq1);                                                \
   numtyp rinvsq1 = ucl_recip(rsq1);                                          \
-  numtyp rainv1 = ucl_recip(r1 - sw_cut);                                \
-  numtyp gsrainv1 = sw_sigma_gamma * rainv1;                             \
+  numtyp rainv1 = ucl_recip(r1 - sw_cut_ij);                                 \
+  numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1;                              \
   numtyp gsrainvsq1 = gsrainv1*rainv1/r1;                                    \
   numtyp expgsrainv1 = ucl_exp(gsrainv1);                                    \
                                                                              \
   numtyp r2 = ucl_sqrt(rsq2);                                                \
-  numtyp rainv2 = ucl_recip(r2 - sw_cut);                                \
-  numtyp gsrainv2 = sw_sigma_gamma * rainv2;                             \
+  numtyp rainv2 = ucl_recip(r2 - sw_cut_ik);                                 \
+  numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2;                              \
   numtyp expgsrainv2 = ucl_exp(gsrainv2);                                    \
                                                                              \
   numtyp rinv12 = ucl_recip(r1*r2);                                          \
   numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12;      \
-  numtyp delcs = cs - sw_costheta;                                       \
+  numtyp delcs = cs - sw_costheta_ijk;                                       \
   numtyp delcssq = delcs*delcs;                                              \
                                                                              \
   numtyp facexp = expgsrainv1*expgsrainv2;                                   \
                                                                              \
-  numtyp facrad = sw_lambda_epsilon * facexp*delcssq;                    \
+  numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq;                    \
   numtyp frad1 = facrad*gsrainvsq1;                                          \
-  numtyp facang = sw_lambda_epsilon2 * facexp*delcs;                     \
+  numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs;                     \
   numtyp facang12 = rinv12*facang;                                           \
   numtyp csfacang = cs*facang;                                               \
   numtyp csfac1 = rinvsq1*csfacang;                                          \
@@ -303,24 +328,24 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
 }
 
 __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, 
+                                const __global numtyp4 *restrict sw1,
+                                const __global numtyp4 *restrict sw2,
+                                const __global numtyp4 *restrict sw3,
+                                const __global int *restrict map,
+                                const __global int *restrict elem2param,
+                                const int nelements,
                                 const __global int * dev_nbor, 
                                 const __global int * dev_packed, 
                                 __global acctyp4 *restrict ans, 
                                 __global acctyp *restrict engv, 
                                 const int eflag, const int vflag, 
                                 const int inum,  const int nbor_pitch, 
-                                const int t_per_atom, const int evatom,
-                                const numtyp sw_cut, const numtyp sw_epsilon,
-                                const numtyp sw_sigma, const numtyp sw_lambda,
-                                const numtyp sw_gamma, const numtyp sw_costheta,
-                                const numtyp sw_cutsq) {
+                                const int t_per_atom, const int evatom) {
   __local int tpa_sq, n_stride;
-  __local numtyp sw_sigma_gamma, sw_lambda_epsilon;
-  __local numtyp sw_lambda_epsilon2;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  sw_sigma_gamma=sw_sigma*sw_gamma;
-  sw_lambda_epsilon=sw_lambda*sw_epsilon;
-  sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
+  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
+  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
 
   int tid, ii, offset;
   atom_info(tpa_sq,ii,tid,offset);
@@ -344,8 +369,8 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
     int offset_k=tid & (t_per_atom-1);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //int iw=ix.w;
-    //int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+    int itype=ix.w; 
+    itype=map[itype];
 
     for ( ; nbor_j<list_end; nbor_j+=n_stride) {
   
@@ -353,15 +378,25 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int mtype=itype+jx.w;
+      int jtype=jx.w;
+      jtype=map[jtype];
 
       // Compute r12
       numtyp delr1x = jx.x-ix.x;
       numtyp delr1y = jx.y-ix.y;
       numtyp delr1z = jx.z-ix.z;
       numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
-        
-      if (rsq1 > sw_cutsq) continue;
+
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
+      numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
+
+      if (rsq1 > sw3_ijparam.y) continue;
+
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;
 
       const __global int *nbor_k=nbor_j-offset_j+offset_k;
       if (nbor_k<=nbor_j)
@@ -372,11 +407,31 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
         k &= NEIGHMASK;
 
         numtyp4 kx; fetch4(kx,k,pos_tex);
+        int ktype=kx.w;
+        ktype=map[ktype];
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
+
         numtyp delr2x = kx.x-ix.x;
         numtyp delr2y = kx.y-ix.y;
         numtyp delr2z = kx.z-ix.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        if (rsq2 < sw_cutsq) {
+        if (rsq2 < sw3_ikparam.y) {   // sw_cutsq=sw3[ikparam].y;
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;
+
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
+          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;
+
           numtyp fjx, fjy, fjz, fkx, fky, fkz;
           threebody(delr1x,delr1y,delr1z,eflag,energy);
 
@@ -403,23 +458,24 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
 }
 
 __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict sw1,
+                             const __global numtyp4 *restrict sw2,
+                             const __global numtyp4 *restrict sw3,
+                             const __global int *restrict map,
+                             const __global int *restrict elem2param,
+                             const int nelements,
                              const __global int * dev_nbor, 
                              const __global int * dev_packed, 
                              __global acctyp4 *restrict ans, 
                              __global acctyp *restrict engv, 
                              const int eflag, const int vflag, 
                              const int inum,  const int nbor_pitch, 
-                             const int t_per_atom, const numtyp sw_cut, 
-                             const numtyp sw_epsilon, const numtyp sw_sigma,
-                             const numtyp sw_lambda, const numtyp sw_gamma,
-                             const numtyp sw_costheta, const numtyp sw_cutsq) {
+                             const int t_per_atom) {
   __local int tpa_sq, n_stride;
-  __local numtyp sw_sigma_gamma, sw_lambda_epsilon;
-  __local numtyp sw_lambda_epsilon2;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  sw_sigma_gamma=sw_sigma*sw_gamma;
-  sw_lambda_epsilon=sw_lambda*sw_epsilon;
-  sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
+  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
+  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
 
   int tid, ii, offset;
   atom_info(tpa_sq,ii,tid,offset);
@@ -443,23 +499,33 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
     int offset_k=tid & (t_per_atom-1);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //int iw=ix.w;
-    //int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+    int itype=ix.w;
+    itype=map[itype];
 
     for ( ; nbor_j<list_end; nbor_j+=n_stride) {
       int j=*nbor_j;
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int mtype=itype+jx.w;
+      int jtype=jx.w;
+      jtype=map[jtype];
 
       // Compute r12
       numtyp delr1x = ix.x-jx.x;
       numtyp delr1y = ix.y-jx.y;
       numtyp delr1z = ix.z-jx.z;
       numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
-        
-      if (rsq1 > sw_cutsq) continue;
+
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
+      numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
+
+      if (rsq1 > sw3_ijparam.y) continue;
+
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;
 
       const __global int *nbor_k=dev_nbor+j+nbor_pitch;
       int numk=*nbor_k; 
@@ -478,15 +544,35 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
         int k=*nbor_k;
         k &= NEIGHMASK;
 
-        if (k == i)
-          continue;
+        if (k == i) continue;
 
         numtyp4 kx; fetch4(kx,k,pos_tex);
+        int ktype=kx.w;
+        ktype=map[ktype];
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; 
+
         numtyp delr2x = kx.x - jx.x;
         numtyp delr2y = kx.y - jx.y;
         numtyp delr2z = kx.z - jx.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        if (rsq2 < sw_cutsq) {
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
+
+        if (rsq2 < sw3_ikparam.y) {
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;
+
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
+          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;
+
           numtyp fjx, fjy, fjz;
           //if (evatom==0) {
             threebody_half(delr1x,delr1y,delr1z);
@@ -512,24 +598,25 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, 
+__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
+                             const __global numtyp4 *restrict sw1,
+                             const __global numtyp4 *restrict sw2,
+                             const __global numtyp4 *restrict sw3,
+                             const __global int *restrict map,
+                             const __global int *restrict elem2param,
+                             const int nelements,
                              const __global int * dev_nbor, 
                              const __global int * dev_packed, 
                              __global acctyp4 *restrict ans, 
                              __global acctyp *restrict engv, 
                              const int eflag, const int vflag, 
                              const int inum,  const int nbor_pitch, 
-                             const int t_per_atom, const numtyp sw_cut, 
-                             const numtyp sw_epsilon, const numtyp sw_sigma,
-                             const numtyp sw_lambda, const numtyp sw_gamma,
-                             const numtyp sw_costheta, const numtyp sw_cutsq) {
+                             const int t_per_atom) {
   __local int tpa_sq, n_stride;
-  __local numtyp sw_sigma_gamma, sw_lambda_epsilon;
-  __local numtyp sw_lambda_epsilon2;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  sw_sigma_gamma=sw_sigma*sw_gamma;
-  sw_lambda_epsilon=sw_lambda*sw_epsilon;
-  sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
+  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
+  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
 
   int tid, ii, offset;
   atom_info(tpa_sq,ii,tid,offset);
@@ -553,26 +640,36 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
     int offset_k=tid & (t_per_atom-1);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //int iw=ix.w;
-    //int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+    int itype=ix.w;
+    itype=map[itype];
 
     for ( ; nbor_j<list_end; nbor_j+=n_stride) {
       int j=*nbor_j;
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int mtype=itype+jx.w;
+      int jtype=jx.w;
+      jtype=map[jtype];
 
       // Compute r12
       numtyp delr1x = ix.x-jx.x;
       numtyp delr1y = ix.y-jx.y;
       numtyp delr1z = ix.z-jx.z;
       numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
-        
-      if (rsq1 > sw_cutsq) continue;
 
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
+      numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
+
+      if (rsq1 > sw3_ijparam.y) continue;
+
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;
+        
       const __global int *nbor_k=dev_nbor+j+nbor_pitch;
-      int numk=*nbor_k; 
+      int numk=*nbor_k;
       if (dev_nbor==dev_packed) {
         nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
         k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
@@ -588,15 +685,35 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
         int k=*nbor_k;
         k &= NEIGHMASK;
 
-        if (k == i)
-          continue;
+        if (k == i) continue;
 
         numtyp4 kx; fetch4(kx,k,pos_tex);
+        int ktype=kx.w;
+        ktype=map[ktype];
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; 
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
+
         numtyp delr2x = kx.x - jx.x;
         numtyp delr2y = kx.y - jx.y;
         numtyp delr2z = kx.z - jx.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        if (rsq2 < sw_cutsq) {
+
+        if (rsq2 < sw3_ikparam.y) {
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;
+
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
+          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;
+
           numtyp fjx, fjy, fjz, fkx, fky, fkz;
           threebody(delr1x,delr1y,delr1z,eflag,energy);
 
diff --git a/lib/gpu/lal_sw.h b/lib/gpu/lal_sw.h
index 82fc7d24e0..66b36a90b0 100644
--- a/lib/gpu/lal_sw.h
+++ b/lib/gpu/lal_sw.h
@@ -37,13 +37,14 @@ class SW : public BaseThree<numtyp, acctyp> {
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
-  int init(const int nlocal, const int nall, const int max_nbors, 
+  int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, 
            const double cell_size, const double gpu_split, FILE *screen,
-           const double epsilon, const double sigma,
-           const double lambda, const double gamma,
-           const double costheta, const double biga,
-           const double bigb, const double powerp,
-           const double powerq, const double cut, const double cutsq);
+           int* host_map, const int nelements, int*** host_elem2param, const int nparams, 
+           const double* epsilon, const double* sigma,
+           const double* lambda, const double* gamma,
+           const double* costheta, const double* biga,
+           const double* bigb, const double* powerp,
+           const double* powerq, const double* cut, const double* cutsq);
            
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
@@ -60,11 +61,26 @@ class SW : public BaseThree<numtyp, acctyp> {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
+  /// Number of atom types   
+  int _lj_types;
+
+  /// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma
+  UCL_D_Vec<numtyp4> sw1;
+  /// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq
+  UCL_D_Vec<numtyp4> sw2;
+  /// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta
+  UCL_D_Vec<numtyp4> sw3;
+
+  UCL_D_Vec<int> elem2param;
+  UCL_D_Vec<int> map;
+  int _nparams,_nelements;
+
+  UCL_Texture sw1_tex, sw2_tex, sw3_tex;
+
  private:
   bool _allocated;
   void loop(const bool _eflag, const bool _vflag, const int evatom);
-  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta;
-  numtyp sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq;
+
 };
 
 }
diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp
index 33abf073d6..16b6828325 100644
--- a/lib/gpu/lal_sw_ext.cpp
+++ b/lib/gpu/lal_sw_ext.cpp
@@ -27,14 +27,15 @@ static SW<PRECISION,ACC_PRECISION> SWMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int sw_gpu_init(const int inum, const int nall, const int max_nbors, 
+int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, 
                 const double cell_size, int &gpu_mode, FILE *screen,
-                const double sw_epsilon, const double sw_sigma,
-                const double sw_lambda, const double sw_gamma,
-                const double sw_costheta, const double sw_biga,
-                const double sw_bigb, const double sw_powerp,
-                const double sw_powerq, const double sw_cut, 
-                const double sw_cutsq) {
+                int* host_map, const int nelements, int*** host_elem2param, const int nparams,
+                const double* sw_epsilon, const double* sw_sigma,
+                const double* sw_lambda, const double* sw_gamma,
+                const double* sw_costheta, const double* sw_biga,
+                const double* sw_bigb, const double* sw_powerp,
+                const double* sw_powerq, const double* sw_cut, 
+                const double* sw_cutsq) {
   SWMF.clear();
   gpu_mode=SWMF.device->gpu_mode();
   double gpu_split=SWMF.device->particle_split();
@@ -55,13 +56,14 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
   int init_ok=0;
   if (world_me==0)
-    init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
+    init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
+                      host_map, nelements, host_elem2param, nparams,
                       sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, 
                       sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq);
 
@@ -72,14 +74,15 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
+      init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
+                        host_map, nelements, host_elem2param, nparams,
                         sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, 
                         sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, 
                         sw_cutsq);