git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10668 f3b2605a-c512-4ea7-a41b-209d697bcdaa
lib/gpu/Makefile.lammps.mac_ocl (new file, 5 lines)
@@ -0,0 +1,5 @@
# Settings that the LAMMPS build will import when this package library is used

gpu_SYSINC =
gpu_SYSLIB = -framework OpenCL
gpu_SYSPATH =
lib/gpu/Makefile.lammps.opencl (new file, 5 lines)
@@ -0,0 +1,5 @@
# Settings that the LAMMPS build will import when this package library is used

gpu_SYSINC =
gpu_SYSLIB = -lOpenCL
gpu_SYSPATH =
lib/gpu/lal_base_three.cpp (new file, 358 lines)
@@ -0,0 +1,358 @@
/***************************************************************************
                               base_three.cpp
                             -------------------
                           W. Michael Brown (ORNL)

  Base class for pair styles with per-particle data for position and type

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                : Tue April 2, 2013
    email                : brownw@ornl.gov
 ***************************************************************************/

#include "lal_base_three.h"
using namespace LAMMPS_AL;
#define BaseThreeT BaseThree<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> global_device;

template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
  device=&global_device;
  ans=new Answer<numtyp,acctyp>();
  nbor=new Neighbor();
#ifdef THREE_CONCURRENT
  ans2=new Answer<numtyp,acctyp>();
#endif
}

template <class numtyp, class acctyp>
BaseThreeT::~BaseThree() {
  delete ans;
  delete nbor;
#ifdef THREE_CONCURRENT
  delete ans2;
#endif
}

template <class numtyp, class acctyp>
int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const {
  int b=device->atom.bytes_per_atom()+ans->bytes_per_atom()+
        nbor->bytes_per_atom(max_nbors);
#ifdef THREE_CONCURRENT
  b+=ans2->bytes_per_atom();
#endif
  return b;
}

template <class numtyp, class acctyp>
int BaseThreeT::init_three(const int nlocal, const int nall,
                           const int max_nbors, const int maxspecial,
                           const double cell_size, const double gpu_split,
                           FILE *_screen, const void *pair_program,
                           const char *k_two, const char *k_three_center,
                           const char *k_three_end) {
  screen=_screen;

  int gpu_nbor=0;
  if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
    gpu_nbor=1;
  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
    gpu_nbor=2;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
  if (host_nlocal>0)
    _gpu_host=1;

  _threads_per_atom=device->threads_per_atom();
  if (_threads_per_atom>1 && gpu_nbor==0) {
    nbor->packing(true);
    _nbor_data=&(nbor->dev_packed);
  } else
    _nbor_data=&(nbor->dev_nbor);
  if (_threads_per_atom*_threads_per_atom>device->warp_size())
    return -10;

  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
                           maxspecial,_gpu_host,max_nbors,cell_size,false,
                           _threads_per_atom);
  if (success!=0)
    return success;

  ucl_device=device->gpu;
  atom=&device->atom;

#ifdef THREE_CONCURRENT
  _end_command_queue=ucl_device->num_queues();
  ucl_device->push_command_queue();
  if (!ans2->init(ans->max_inum(),false,false,*(device->gpu)))
    return -3;
  ans2->cq(_end_command_queue);
#endif

  _block_pair=device->pair_block_size();
  _block_size=device->block_ellipse();
  compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);

  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_nbor,gpu_split);

  // Initialize timers for the selected GPU
  time_pair.init(*ucl_device);
  time_pair.zero();

  pos_tex.bind_float(atom->x,4);

  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
  _max_an_bytes+=ans2->gpu_bytes();
#endif

  return 0;
}

template <class numtyp, class acctyp>
void BaseThreeT::estimate_gpu_overhead() {
  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}

template <class numtyp, class acctyp>
void BaseThreeT::clear_atomic() {
  // Output any timing information
  acc_timers();
  double avg_split=hd_balancer.all_avg_split();
  _gpu_overhead*=hd_balancer.timestep();
  _driver_overhead*=hd_balancer.timestep();
  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);

  if (_compiled) {
    k_three_center.clear();
    k_three_end.clear();
    k_three_end_vatom.clear();
    k_pair.clear();
    delete pair_program;
    _compiled=false;
  }

  time_pair.clear();
  hd_balancer.clear();

  nbor->clear();
  ans->clear();
#ifdef THREE_CONCURRENT
  ans2->clear();
  assert(ucl_device->num_queues()==_end_command_queue+1);
  ucl_device->pop_command_queue();
#endif
  device->clear();
}

// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
                              int *ilist, int *numj, int **firstneigh,
                              bool &success) {
  success=true;

  int mn=nbor->max_nbor_loop(nlist,numj,ilist);
  resize_atom(inum,nall,success);
  resize_local(nall,mn,success);
  if (!success)
    return NULL;

  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());

  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
  bytes+=ans2->gpu_bytes();
#endif
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;

  return ilist;
}

// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseThreeT::build_nbor_list(const int inum, const int host_inum,
                                        const int nall, double **host_x,
                                        int *host_type, double *sublo,
                                        double *subhi, int *tag,
                                        int **nspecial, int **special,
                                        bool &success) {
  success=true;
  resize_atom(inum,nall,success);
  resize_local(nall,host_inum,nbor->max_nbors(),success);
  if (!success)
    return;
  atom->cast_copy_x(host_x,host_type);

  int mn;
  nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
                        nspecial, special, success, mn);

  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
  bytes+=ans2->gpu_bytes();
#endif
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;
}

// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
                         const int nlist, double **host_x, int *host_type,
                         int *ilist, int *numj, int **firstneigh,
                         const bool eflag, const bool vflag, const bool eatom,
                         const bool vatom, int &host_start,
                         const double cpu_time, bool &success) {
  acc_timers();
  if (nlist==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    resize_atom(0,nall,success);
    zero_timers();
    return;
  }

  int ago=hd_balancer.ago_first(f_ago);
  int inum=hd_balancer.balance(ago,nlocal,cpu_time);
  ans->inum(inum);
#ifdef THREE_CONCURRENT
  ans2->inum(inum);
#endif
  host_start=inum;

  if (ago==0) {
    reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
    if (!success)
      return;
  }

  atom->cast_x_data(host_x,host_type);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);

  int evatom=0;
  if (eatom || vatom)
    evatom=1;
#ifdef THREE_CONCURRENT
  ucl_device->sync();
#endif
  loop(eflag,vflag,evatom);
  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
  device->add_ans_object(ans);
#ifdef THREE_CONCURRENT
  ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
  device->add_ans_object(ans2);
#endif
  hd_balancer.stop_timer();
}

// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** BaseThreeT::compute(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           double *sublo, double *subhi, int *tag,
                           int **nspecial, int **special, const bool eflag,
                           const bool vflag, const bool eatom,
                           const bool vatom, int &host_start,
                           int **ilist, int **jnum,
                           const double cpu_time, bool &success) {
  acc_timers();
  if (inum_full==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    resize_atom(0,nall,success);
    zero_timers();
    return NULL;
  }

  hd_balancer.balance(cpu_time);
  int inum=hd_balancer.get_gpu_count(ago,inum_full);
  ans->inum(inum);
#ifdef THREE_CONCURRENT
  ans2->inum(inum);
#endif
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                    sublo, subhi, tag, nspecial, special, success);
    if (!success)
      return NULL;
    hd_balancer.start_timer();
  } else {
    atom->cast_x_data(host_x,host_type);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
  *ilist=nbor->host_ilist.begin();
  *jnum=nbor->host_acc.begin();

  int evatom=0;
  if (eatom || vatom)
    evatom=1;
#ifdef THREE_CONCURRENT
  ucl_device->sync();
#endif
  loop(eflag,vflag,evatom);
  ans->copy_answers(eflag,vflag,eatom,vatom);
  device->add_ans_object(ans);
#ifdef THREE_CONCURRENT
  ans2->copy_answers(eflag,vflag,eatom,vatom);
  device->add_ans_object(ans2);
#endif
  hd_balancer.stop_timer();

  return nbor->host_jlist.begin()-host_start;
}

template <class numtyp, class acctyp>
double BaseThreeT::host_memory_usage_atomic() const {
  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
         4*sizeof(numtyp)+sizeof(BaseThree<numtyp,acctyp>);
}

template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                 const char *ktwo, const char *kthree_center,
                                 const char *kthree_end) {
  if (_compiled)
    return;

  std::string vatom_name=std::string(kthree_end)+"_vatom";

  pair_program=new UCL_Program(dev);
  pair_program->load_string(pair_str,device->compile_string().c_str());
  k_three_center.set_function(*pair_program,kthree_center);
  k_three_end.set_function(*pair_program,kthree_end);
  k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
  k_pair.set_function(*pair_program,ktwo);
  pos_tex.get_texture(*pair_program,"pos_tex");

#ifdef THREE_CONCURRENT
  k_three_end.cq(ucl_device->cq(_end_command_queue));
  k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
#endif

  _compiled=true;
}

template class BaseThree<PRECISION,ACC_PRECISION>;
lib/gpu/lal_base_three.h (new file, 221 lines)
@@ -0,0 +1,221 @@
/***************************************************************************
                                base_three.h
                             -------------------
                           W. Michael Brown (ORNL)

  Base class for 3-body potentials

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                : Tue April 2, 2013
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef LAL_BASE_THREE_H
#define LAL_BASE_THREE_H

#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"

#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class BaseThree {
 public:
  BaseThree();
  virtual ~BaseThree();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \param k_two name for the kernel for 2-body force calculation
    * \param k_three name for the kernel for 3-body force calculation
    *
    * Returns:
    * - 0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card
    * - -10 if invalid thread_per_atom setting **/
  int init_three(const int nlocal, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size,
                 const double gpu_split, FILE *screen,
                 const void *pair_program, const char *k_two,
                 const char *k_three_center, const char *k_three_end);

  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead();

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(nall, success))
      pos_tex.bind_float(atom->x,4);
    ans->resize(inum,success);
#ifdef THREE_CONCURRENT
    ans2->resize(inum,success);
#endif
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to host
    * \param current maximum number of neighbors
    * \note olist_size=total number of local particles **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to host
    * \param current maximum number of neighbors
    * \note host_inum is 0 if the host is performing neighboring
    * \note nlocal+host_inum=total number of local particles
    * \note olist_size=0 **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_atomic();

  /// Returns memory usage on device per atom
  int bytes_per_atom_atomic(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_atomic() const;

  /// Accumulate timers
  inline void acc_timers() {
    if (device->time_device()) {
      nbor->acc_timers();
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
#ifdef THREE_CONCURRENT
      ans2->acc_timers();
#endif
    }
  }

  /// Zero timers
  inline void zero_timers() {
    time_pair.zero();
    atom->zero_timers();
    ans->zero_timers();
#ifdef THREE_CONCURRENT
    ans2->zero_timers();
#endif
  }

  /// Copy neighbor list from host
  int * reset_nbors(const int nall, const int inum, const int nlist, int *ilist,
                    int *numj, int **firstneigh, bool &success);

  /// Build neighbor list on device
  void build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
                       double *sublo, double *subhi, int *tag, int **nspecial,
                       int **special, bool &success);

  /// Pair loop with host neighboring
  void compute(const int f_ago, const int inum_full, const int nall,
               const int nlist, double **host_x, int *host_type,
               int *ilist, int *numj, int **firstneigh, const bool eflag,
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success);

  /// Pair loop with device neighboring
  int * compute(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, double *sublo,
                double *subhi, int *tag, int **nspecial,
                int **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                const double cpu_time, bool &success);

  /// Pair loop with device neighboring
  int ** compute(const int ago, const int inum_full,
                 const int nall, double **host_x, int *host_type, double *sublo,
                 double *subhi, int *tag, int **nspecial,
                 int **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
                 int **ilist, int **numj, const double cpu_time, bool &success);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  Device<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_pair;

  /// Host device load balancer
  Balance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> *atom;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  Answer<numtyp,acctyp> *ans;
#ifdef THREE_CONCURRENT
  Answer<numtyp,acctyp> *ans2;
#endif

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  Neighbor *nbor;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
  UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
  inline int block_pair() { return _block_pair; }
  inline int block_size() { return _block_size; }

  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;

 protected:
  bool _compiled;
  int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
  double _max_bytes, _max_an_bytes;
  double _gpu_overhead, _driver_overhead;
  UCL_D_Vec<int> *_nbor_data;

  void compile_kernels(UCL_Device &dev, const void *pair_string,
                       const char *k_two, const char *k_three_center,
                       const char *k_three_end);

  virtual void loop(const bool _eflag, const bool _vflag,
                    const int evatom) = 0;
};

}

#endif
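For context, a concrete 3-body pair style would derive from this class, hand its device program and kernel names to init_three(), and implement the pure virtual loop() that launches the kernels. The sketch below is illustrative only; the class name ExampleThree and the kernel names "k_example", "k_example_three_center", and "k_example_three_end" are placeholders and not part of this commit.

#include "lal_base_three.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class ExampleThree : public BaseThree<numtyp,acctyp> {
 public:
  // Forward the device program and placeholder kernel names to the base class.
  int init(const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, const void *program) {
    return this->init_three(nlocal,nall,max_nbors,maxspecial,cell_size,
                            gpu_split,screen,program,"k_example",
                            "k_example_three_center","k_example_three_end");
  }

 private:
  // Required by BaseThree: enqueue k_pair, k_three_center and
  // k_three_end / k_three_end_vatom for one force evaluation.
  void loop(const bool eflag, const bool vflag, const int evatom) {
    // kernel launches omitted in this sketch
  }
};

}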
lib/gpu/lal_beck.cpp (new file, 152 lines)
@@ -0,0 +1,152 @@
/***************************************************************************
                                  beck.cpp
                             -------------------
                           Trung Dac Nguyen (ORNL)

  Class for acceleration of the beck pair style.

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : nguyentd@ornl.gov
 ***************************************************************************/

#ifdef USE_OPENCL
#include "beck_cl.h"
#elif defined(USE_CUDART)
const char *beck=0;
#else
#include "beck_cubin.h"
#endif

#include "lal_beck.h"
#include <cassert>
using namespace LAMMPS_AL;
#define BeckT Beck<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> device;

template <class numtyp, class acctyp>
BeckT::Beck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}

template <class numtyp, class acctyp>
BeckT::~Beck() {
  clear();
}

template <class numtyp, class acctyp>
int BeckT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
int BeckT::init(const int ntypes,
                double **host_cutsq, double **host_aa,
                double **host_alpha, double **host_beta,
                double **host_AA, double **host_BB,
                double *host_special_lj, const int nlocal,
                const int nall, const int max_nbors,
                const int maxspecial, const double cell_size,
                const double gpu_split, FILE *_screen) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,beck,"k_beck");
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_ONLY);

  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  beck1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,beck1,host_write,host_aa,host_alpha,
                         host_beta);

  beck2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,beck2,host_write,host_AA,host_BB,
                         host_cutsq);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
  dview.view(host_special_lj,4,*(this->ucl_device));
  ucl_copy(sp_lj,dview,false);

  _allocated=true;
  this->_max_bytes=beck1.row_bytes()+beck2.row_bytes()+sp_lj.row_bytes();
  return 0;
}

template <class numtyp, class acctyp>
void BeckT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  beck1.clear();
  beck2.clear();
  sp_lj.clear();
  this->clear_atomic();
}

template <class numtyp, class acctyp>
double BeckT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(Beck<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BeckT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->x, &beck1, &beck2, &sp_lj,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->ans->force, &this->ans->engv, &eflag, &vflag,
                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->x, &beck1, &beck2, &_lj_types, &sp_lj,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
                     &ainum, &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
}

template class Beck<PRECISION,ACC_PRECISION>;
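The grid sizing in loop() above divides the atoms assigned to the device among thread blocks of BX threads, with _threads_per_atom threads cooperating on each atom, so each block covers BX/_threads_per_atom atoms. A minimal standalone arithmetic check of that formula, with assumed (not measured) values:

#include <cmath>

int main() {
  // Assumed values, for illustration only: 128-thread blocks, 4 threads
  // cooperating per atom, 10000 atoms assigned to the device.
  const int BX = 128, threads_per_atom = 4, inum = 10000;
  // Same expression as BeckT::loop(): each block covers 128/4 = 32 atoms,
  // so GX = ceil(10000/32) = 313 thread blocks.
  const int GX = static_cast<int>(std::ceil(static_cast<double>(inum)/
                                            (BX/threads_per_atom)));
  return GX == 313 ? 0 : 1;
}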
lib/gpu/lal_beck.h (new file, 80 lines)
@@ -0,0 +1,80 @@
/***************************************************************************
                                   beck.h
                             -------------------
                           Trung Dac Nguyen (ORNL)

  Class for acceleration of the beck pair style.

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : nguyentd@ornl.gov
 ***************************************************************************/

#ifndef LAL_BECK_H
#define LAL_BECK_H

#include "lal_base_atomic.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class Beck : public BaseAtomic<numtyp, acctyp> {
 public:
  Beck();
  ~Beck();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * - 0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, double **host_cutsq,
           double **host_aa, double **host_alpha,
           double **host_beta, double **host_AA,
           double **host_BB, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// beck1.x = aa, beck1.y = alpha, beck1.z = beta
  UCL_D_Vec<numtyp4> beck1;
  /// beck2.x = AA, beck2.y = BB, beck2.z = cutsq
  UCL_D_Vec<numtyp4> beck2;
  /// Special LJ values
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
};

}

#endif
lib/gpu/lal_beck_ext.cpp (new file, 120 lines)
@@ -0,0 +1,120 @@
/***************************************************************************
                                beck_ext.cpp
                             -------------------
                           Trung Dac Nguyen (ORNL)

  Functions for LAMMPS access to beck acceleration routines.

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : nguyentd@ornl.gov
 ***************************************************************************/

#include <iostream>
#include <cassert>
#include <math.h>

#include "lal_beck.h"

using namespace std;
using namespace LAMMPS_AL;

static Beck<PRECISION,ACC_PRECISION> BLMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
                  double **alpha, double **beta, double **AA, double **BB,
                  double *special_lj, const int inum, const int nall,
                  const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen) {
  BLMF.clear();
  gpu_mode=BLMF.device->gpu_mode();
  double gpu_split=BLMF.device->particle_split();
  int first_gpu=BLMF.device->first_device();
  int last_gpu=BLMF.device->last_device();
  int world_me=BLMF.device->world_me();
  int gpu_rank=BLMF.device->gpu_rank();
  int procs_per_gpu=BLMF.device->procs_per_gpu();

  BLMF.device->init_message(screen,"beck",first_gpu,last_gpu);

  bool message=false;
  if (BLMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  int init_ok=0;
  if (world_me==0)
    init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta,
                      AA, BB, special_lj, inum, nall, 300,
                      maxspecial, cell_size, gpu_split, screen);

  BLMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB,
                        special_lj, inum, nall, 300, maxspecial,
                        cell_size, gpu_split, screen);

    BLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    BLMF.estimate_gpu_overhead();
  return init_ok;
}

void beck_gpu_clear() {
  BLMF.clear();
}

int ** beck_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
                          double *sublo, double *subhi, int *tag, int **nspecial,
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum, const double cpu_time,
                          bool &success) {
  return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                      subhi, tag, nspecial, special, eflag, vflag, eatom,
                      vatom, host_start, ilist, jnum, cpu_time, success);
}

void beck_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success) {
  BLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
               firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

double beck_gpu_bytes() {
  return BLMF.host_memory_usage();
}
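The functions above form the C-style interface a host pair style drives: beck_gpu_init() once at setup, then either beck_gpu_compute_n() (device neighboring) or beck_gpu_compute() (host-built list) each step, and beck_gpu_clear() at the end. The fragment below is only a sketch of that setup call; the helper name setup_beck_gpu and every argument value (max_nbors=300, maxspecial=0, cell_size=10.0) are assumptions, not values taken from LAMMPS.

#include <cstdio>

// Prototypes as defined in lal_beck_ext.cpp above.
int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
                  double **alpha, double **beta, double **AA, double **BB,
                  double *special_lj, const int inum, const int nall,
                  const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen);
void beck_gpu_clear();
double beck_gpu_bytes();

// Hypothetical setup helper: initialize once, report the device footprint,
// and leave beck_gpu_clear() to the caller when the run ends.
bool setup_beck_gpu(int ntypes, double **cutsq, double **aa, double **alpha,
                    double **beta, double **AA, double **BB,
                    double *special_lj, int nlocal, int nall, FILE *screen) {
  int gpu_mode;
  int ok = beck_gpu_init(ntypes, cutsq, aa, alpha, beta, AA, BB, special_lj,
                         nlocal, nall, 300, 0, 10.0, gpu_mode, screen);
  if (ok != 0) return false;   // non-zero follows the codes listed in lal_beck.h
  fprintf(screen, "beck/gpu device bytes: %g\n", beck_gpu_bytes());
  return true;
}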
lib/gpu/lal_lj_coul_msm.cpp (new file, 200 lines)
@@ -0,0 +1,200 @@
/***************************************************************************
                               lj_coul_msm.cpp
                             -------------------
                           Trung Dac Nguyen (ORNL)

  Class for acceleration of the lj/cut/coul/msm pair style.

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#if defined(USE_OPENCL)
#include "lj_coul_msm_cl.h"
#elif defined(USE_CUDART)
const char *lj_coul_msm=0;
#else
#include "lj_coul_msm_cubin.h"
#endif

#include "lal_lj_coul_msm.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJCoulMSMT LJCoulMSM<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> device;

template <class numtyp, class acctyp>
LJCoulMSMT::LJCoulMSM() : BaseCharge<numtyp,acctyp>(),
                          _allocated(false) {
}

template <class numtyp, class acctyp>
LJCoulMSMT::~LJCoulMSM() {
  clear();
}

template <class numtyp, class acctyp>
int LJCoulMSMT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
int LJCoulMSMT::init(const int ntypes,
                     double **host_cutsq, double **host_lj1,
                     double **host_lj2, double **host_lj3,
                     double **host_lj4, double **host_gcons,
                     double **host_dgcons, double **host_offset,
                     double *host_special_lj, const int nlocal,
                     const int nall, const int max_nbors,
                     const int maxspecial, const double cell_size,
                     const double gpu_split, FILE *_screen,
                     double **host_cut_ljsq, const double host_cut_coulsq,
                     double *host_special_coul, const int order,
                     const double qqrd2e) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,lj_coul_msm,"k_lj_coul_msm");
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_ONLY);

  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                         host_cutsq, host_cut_ljsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                         host_offset);

  // pack gcons and dgcons
  int nrows, ncols;
  nrows = 7;
  ncols = 7;
  UCL_H_Vec<numtyp> dview_gcons(nrows*ncols,*(this->ucl_device),
                                UCL_WRITE_ONLY);

  for (int ix=0; ix<nrows; ix++)
    for (int iy=0; iy<ncols; iy++)
      dview_gcons[ix*ncols+iy]=host_gcons[ix][iy];

  gcons.alloc(nrows*ncols,*(this->ucl_device),UCL_READ_ONLY);
  ucl_copy(gcons,dview_gcons,false);
  gcons_tex.get_texture(*(this->pair_program),"gcons_tex");
  gcons_tex.bind_float(gcons,1);

  nrows = 7;
  ncols = 6;
  UCL_H_Vec<numtyp> dview_dgcons(nrows*ncols,*(this->ucl_device),
                                 UCL_WRITE_ONLY);

  for (int ix=0; ix<nrows; ix++)
    for (int iy=0; iy<ncols; iy++)
      dview_dgcons[ix*ncols+iy]=host_dgcons[ix][iy];

  dgcons.alloc(nrows*ncols,*(this->ucl_device),UCL_READ_ONLY);
  ucl_copy(dgcons,dview_dgcons,false);
  dgcons_tex.get_texture(*(this->pair_program),"dgcons_tex");
  dgcons_tex.bind_float(dgcons,1);

  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<4; i++) {
    host_write[i]=host_special_lj[i];
    host_write[i+4]=host_special_coul[i];
  }
  ucl_copy(sp_lj,host_write,8,false);

  _cut_coulsq=host_cut_coulsq;
  _qqrd2e=qqrd2e;
  _order=order;

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+
                   gcons.row_bytes()+dgcons.row_bytes()+sp_lj.row_bytes();
  return 0;
}

template <class numtyp, class acctyp>
void LJCoulMSMT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  lj1.clear();
  lj3.clear();
  gcons.clear();
  dgcons.clear();
  sp_lj.clear();
  this->clear_atomic();
}

template <class numtyp, class acctyp>
double LJCoulMSMT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(LJCoulMSM<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->ans->force, &this->ans->engv, &eflag,
                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
                          &_cut_coulsq, &_qqrd2e, &_order,
                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons,
                     &_lj_types, &sp_lj, &this->nbor->dev_nbor,
                     &this->_nbor_data->begin(), &this->ans->force,
                     &this->ans->engv, &eflag, &vflag, &ainum,
                     &nbor_pitch, &this->atom->q, &_cut_coulsq,
                     &_qqrd2e, &_order, &this->_threads_per_atom);
  }
  this->time_pair.stop();
}

template class LJCoulMSM<PRECISION,ACC_PRECISION>;
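In init() above, the MSM coefficient tables host_gcons (7x7) and host_dgcons (7x6) are flattened row-major into one-dimensional host views, copied to the device vectors gcons and dgcons, and bound to textures; element (ix,iy) lands at index ix*ncols+iy. A minimal standalone illustration of that indexing, using a made-up 2x3 table instead of the real coefficients:

#include <cstdio>

int main() {
  // Same row-major flattening used for dview_gcons/dview_dgcons above,
  // shown on a small invented table.
  const int nrows = 2, ncols = 3;
  double table[2][3] = {{1,2,3},{4,5,6}};
  double flat[nrows*ncols];
  for (int ix = 0; ix < nrows; ix++)
    for (int iy = 0; iy < ncols; iy++)
      flat[ix*ncols+iy] = table[ix][iy];
  // Element (1,2) of the table sits at flat[1*3+2] = flat[5].
  printf("%g\n", flat[1*ncols+2]);   // prints 6
  return 0;
}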
lib/gpu/lal_lj_coul_msm.h (new file, 88 lines)
@@ -0,0 +1,88 @@
/***************************************************************************
                                lj_coul_msm.h
                             -------------------
                           Trung Dac Nguyen (ORNL)

  Class for acceleration of the lj/cut/coul/msm pair style.

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : nguyentd@ornl.gov
 ***************************************************************************/

#ifndef LAL_LJ_COUL_MSM_H
#define LAL_LJ_COUL_MSM_H

#include "lal_base_charge.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class LJCoulMSM : public BaseCharge<numtyp, acctyp> {
 public:
  LJCoulMSM();
  ~LJCoulMSM();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * - 0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, double **host_cutsq,
           double **host_lj1, double **host_lj2, double **host_lj3,
           double **host_lj4, double **host_gcons, double **host_dgcons,
           double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, double **host_cut_ljsq,
           const double host_cut_coulsq, double *host_special_coul,
           const int order, const double qqrd2e);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
  UCL_D_Vec<numtyp4> lj1;
  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
  UCL_D_Vec<numtyp4> lj3;
  /// Special LJ values [0-3] and Special Coul values [4-7]
  UCL_D_Vec<numtyp> sp_lj;

  UCL_D_Vec<numtyp> gcons, dgcons;
  UCL_Texture gcons_tex, dgcons_tex;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

  numtyp _cut_coulsq, _qqrd2e;
  int _order;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
};

}

#endif
lib/gpu/lal_lj_coul_msm_ext.cpp (new file, 131 lines)
@@ -0,0 +1,131 @@
/***************************************************************************
                             lj_coul_msm_ext.cpp
                             -------------------
                           W. Michael Brown (ORNL)

  Functions for LAMMPS access to lj/cut/coul/msm acceleration routines.

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#include <iostream>
#include <cassert>
#include <math.h>

#include "lal_lj_coul_msm.h"

using namespace std;
using namespace LAMMPS_AL;

static LJCoulMSM<PRECISION,ACC_PRECISION> LJCMLMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
                  double **host_gcons, double **host_dgcons,
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  double **host_cut_ljsq, double host_cut_coulsq,
                  double *host_special_coul, const int order, const double qqrd2e) {
  LJCMLMF.clear();
  gpu_mode=LJCMLMF.device->gpu_mode();
  double gpu_split=LJCMLMF.device->particle_split();
  int first_gpu=LJCMLMF.device->first_device();
  int last_gpu=LJCMLMF.device->last_device();
  int world_me=LJCMLMF.device->world_me();
  int gpu_rank=LJCMLMF.device->gpu_rank();
  int procs_per_gpu=LJCMLMF.device->procs_per_gpu();

  LJCMLMF.device->init_message(screen,"lj/cut/coul/msm",first_gpu,last_gpu);

  bool message=false;
  if (LJCMLMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  int init_ok=0;
  if (world_me==0)
    init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                         host_gcons, host_dgcons, offset,
                         special_lj, inum, nall, 300, maxspecial,
                         cell_size, gpu_split, screen, host_cut_ljsq,
                         host_cut_coulsq, host_special_coul, order, qqrd2e);

  LJCMLMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                           host_gcons, host_dgcons, offset,
                           special_lj, inum, nall, 300, maxspecial,
                           cell_size, gpu_split, screen, host_cut_ljsq,
                           host_cut_coulsq, host_special_coul, order, qqrd2e);

    LJCMLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    LJCMLMF.estimate_gpu_overhead();
  return init_ok;
}

void ljcm_gpu_clear() {
  LJCMLMF.clear();
}

int** ljcm_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *sublo, double *subhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
                         bool &success, double *host_q, double *boxlo,
                         double *prd) {
  return LJCMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         host_q, boxlo, prd);
}

void ljcm_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success, double *host_q,
                      const int nlocal, double *boxlo, double *prd) {
  LJCMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                  firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                  host_q,nlocal,boxlo,prd);
}

double ljcm_gpu_bytes() {
  return LJCMLMF.host_memory_usage();
}
152
lib/gpu/lal_mie.cpp
Normal file
@ -0,0 +1,152 @@

/***************************************************************************
mie.cpp
-------------------
Trung Dac Nguyen (ORNL)

Class for acceleration of the mie pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : nguyentd@ornl.gov
***************************************************************************/

#ifdef USE_OPENCL
#include "mie_cl.h"
#elif defined(USE_CUDART)
const char *mie=0;
#else
#include "mie_cubin.h"
#endif

#include "lal_mie.h"
#include <cassert>
using namespace LAMMPS_AL;
#define MieT Mie<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> device;

template <class numtyp, class acctyp>
MieT::Mie() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}

template <class numtyp, class acctyp>
MieT::~Mie() {
  clear();
}

template <class numtyp, class acctyp>
int MieT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
int MieT::init(const int ntypes, double **host_cutsq,
               double **host_mie1, double **host_mie2,
               double **host_mie3, double **host_mie4,
               double **host_gamA, double **host_gamR,
               double **host_offset, double *host_special_lj,
               const int nlocal, const int nall, const int max_nbors,
               const int maxspecial, const double cell_size,
               const double gpu_split, FILE *_screen) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,mie,"k_mie");
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_ONLY);

  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  mie1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,mie1,host_write,host_mie1,host_mie2,
                         host_gamA,host_gamR);

  mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4,
                         host_offset,host_cutsq);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
  dview.view(host_special_lj,4,*(this->ucl_device));
  ucl_copy(sp_lj,dview,false);

  _allocated=true;
  this->_max_bytes=mie1.row_bytes()+mie3.row_bytes()+sp_lj.row_bytes();
  return 0;
}

template <class numtyp, class acctyp>
void MieT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  mie1.clear();
  mie3.clear();
  sp_lj.clear();
  this->clear_atomic();
}

template <class numtyp, class acctyp>
double MieT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(Mie<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void MieT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->x, &mie1, &mie3, &sp_lj,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->ans->force, &this->ans->engv, &eflag, &vflag,
                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->x, &mie1, &mie3, &_lj_types, &sp_lj,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
                     &ainum, &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
}

template class Mie<PRECISION,ACC_PRECISION>;
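For reference, the grid size GX computed in MieT::loop() above is simply the number of thread blocks needed so that each of the inum particles owned by the device gets _threads_per_atom threads out of a block of BX threads. A stand-alone version of that arithmetic, with made-up sample values, is sketched below.

// Stand-alone illustration of the GX calculation in loop(); the numbers are
// sample values, not values taken from the library.
#include <cmath>
#include <cstdio>

int main() {
  int inum = 10000;          // particles assigned to this device (sample value)
  int BX = 128;              // threads per block (sample value)
  int threads_per_atom = 4;  // sample value
  // Each block covers BX/threads_per_atom atoms, so round the block count up.
  int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                 (BX / threads_per_atom)));
  printf("atoms per block = %d, blocks launched = %d\n",
         BX / threads_per_atom, GX);
  return 0;
}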
80
lib/gpu/lal_mie.h
Normal file
@ -0,0 +1,80 @@

/***************************************************************************
mie.h
-------------------
Trung Dac Nguyen (ORNL)

Class for acceleration of the mie/cut pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : nguyentd@ornl.gov
***************************************************************************/

#ifndef LAL_MIE_H
#define LAL_MIE_H

#include "lal_base_atomic.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class Mie : public BaseAtomic<numtyp, acctyp> {
 public:
  Mie();
  ~Mie();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * - 0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, double **host_cutsq,
           double **host_mie1, double **host_mie2, double **host_mie3,
           double **host_mie4, double **host_gamA, double **host_gamR,
           double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// mie1.x = mie1, mie1.y = mie2, mie1.z = gamA, mie1.w = gamR
  UCL_D_Vec<numtyp4> mie1;
  /// mie3.x = mie3, mie3.y = mie4, mie3.z = offset, mie3.w = cutsq
  UCL_D_Vec<numtyp4> mie3;
  /// Special Mie values
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
};

}

#endif
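For background on the coefficient tables declared above: mie1 and mie3 hold the per-type-pair constants of the Mie potential, whose textbook form is E(r) = C*eps*((sigma/r)^gammaR - (sigma/r)^gammaA) with C = gammaR/(gammaR-gammaA) * (gammaR/gammaA)^(gammaA/(gammaR-gammaA)). The GPU kernel is not part of this commit, so the sketch below is only a host-side reference evaluation of that standard form, shown to document what gamA and gamR represent.

// Reference (host-side) evaluation of the standard Mie potential; this is
// background for the packed gamA/gamR exponents, not the GPU kernel.
#include <cmath>
#include <cstdio>

double mie_energy(double r, double eps, double sigma,
                  double gamR, double gamA) {
  double C = (gamR / (gamR - gamA)) * pow(gamR / gamA, gamA / (gamR - gamA));
  return C * eps * (pow(sigma / r, gamR) - pow(sigma / r, gamA));
}

int main() {
  // With gamR=12 and gamA=6 the prefactor C equals 4, i.e. the familiar
  // 12-6 Lennard-Jones form.
  printf("%g\n", mie_energy(1.12, 1.0, 1.0, 12.0, 6.0));
  return 0;
}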
124
lib/gpu/lal_mie_ext.cpp
Normal file
@ -0,0 +1,124 @@

/***************************************************************************
mie_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)

Functions for LAMMPS access to mie acceleration routines.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : nguyentd@ornl.gov
***************************************************************************/

#include <iostream>
#include <cassert>
#include <math.h>

#include "lal_mie.h"

using namespace std;
using namespace LAMMPS_AL;

static Mie<PRECISION,ACC_PRECISION> MLMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
                 double **host_mie2, double **host_mie3, double **host_mie4,
                 double **host_gamA, double **host_gamR,
                 double **offset, double *special_lj,
                 const int inum, const int nall, const int max_nbors,
                 const int maxspecial,
                 const double cell_size, int &gpu_mode, FILE *screen) {
  MLMF.clear();
  gpu_mode=MLMF.device->gpu_mode();
  double gpu_split=MLMF.device->particle_split();
  int first_gpu=MLMF.device->first_device();
  int last_gpu=MLMF.device->last_device();
  int world_me=MLMF.device->world_me();
  int gpu_rank=MLMF.device->gpu_rank();
  int procs_per_gpu=MLMF.device->procs_per_gpu();

  MLMF.device->init_message(screen,"mie",first_gpu,last_gpu);

  bool message=false;
  if (MLMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  int init_ok=0;
  if (world_me==0)
    init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2,
                      host_mie3, host_mie4, host_gamA, host_gamR,
                      offset, special_lj, inum, nall, 300,
                      maxspecial, cell_size, gpu_split, screen);

  MLMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2,
                        host_mie3, host_mie4, host_gamA, host_gamR,
                        offset, special_lj, inum, nall, 300, maxspecial,
                        cell_size, gpu_split, screen);

    MLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    MLMF.estimate_gpu_overhead();
  return init_ok;
}

void mie_gpu_clear() {
  MLMF.clear();
}

int ** mie_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *sublo, double *subhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
                         bool &success) {
  return MLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                      subhi, tag, nspecial, special, eflag, vflag, eatom,
                      vatom, host_start, ilist, jnum, cpu_time, success);
}

void mie_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success) {
  MLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
               firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

double mie_gpu_bytes() {
  return MLMF.host_memory_usage();
}
145
lib/gpu/lal_soft.cpp
Normal file
@ -0,0 +1,145 @@

/***************************************************************************
soft.cpp
-------------------
Trung Dac Nguyen (ORNL)

Class for acceleration of the soft pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : nguyentd@ornl.gov
***************************************************************************/

#ifdef USE_OPENCL
#include "soft_cl.h"
#elif defined(USE_CUDART)
const char *soft=0;
#else
#include "soft_cubin.h"
#endif

#include "lal_soft.h"
#include <cassert>
using namespace LAMMPS_AL;
#define SoftT Soft<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> device;

template <class numtyp, class acctyp>
SoftT::Soft() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}

template <class numtyp, class acctyp>
SoftT::~Soft() {
  clear();
}

template <class numtyp, class acctyp>
int SoftT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
int SoftT::init(const int ntypes, double **host_cutsq,
                double **host_prefactor, double **host_cut,
                double *host_special_lj, const int nlocal,
                const int nall, const int max_nbors,
                const int maxspecial, const double cell_size,
                const double gpu_split, FILE *_screen) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,soft,"k_soft");
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_ONLY);

  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_prefactor,
                         host_cut,host_cutsq);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
  dview.view(host_special_lj,4,*(this->ucl_device));
  ucl_copy(sp_lj,dview,false);

  _allocated=true;
  this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
  return 0;
}

template <class numtyp, class acctyp>
void SoftT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  coeff.clear();
  sp_lj.clear();
  this->clear_atomic();
}

template <class numtyp, class acctyp>
double SoftT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(Soft<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void SoftT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->ans->force, &this->ans->engv, &eflag, &vflag,
                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
                     &ainum, &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
}

template class Soft<PRECISION,ACC_PRECISION>;
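For background on the coeff table packed in SoftT::init() above (prefactor, cut, cutsq per type pair): the soft pair style energy has the form E(r) = A*(1 + cos(pi*r/rc)) for r < rc. The kernel itself is not part of this commit, so the sketch below is only a host-side reference evaluation of that form, shown to document what the packed values represent.

// Reference (host-side) evaluation of the soft potential; not the GPU kernel.
#include <cmath>
#include <cstdio>

double soft_energy(double r, double prefactor, double cut) {
  const double pi = acos(-1.0);
  if (r >= cut) return 0.0;
  return prefactor * (1.0 + cos(pi * r / cut));
}

int main() {
  printf("%g\n", soft_energy(0.5, 10.0, 2.5));  // sample values
  return 0;
}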
77
lib/gpu/lal_soft.h
Normal file
@ -0,0 +1,77 @@

/***************************************************************************
soft.h
-------------------
Trung Dac Nguyen (ORNL)

Class for acceleration of the soft pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : nguyentd@ornl.gov
***************************************************************************/

#ifndef LAL_SOFT_H
#define LAL_SOFT_H

#include "lal_base_atomic.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class Soft : public BaseAtomic<numtyp, acctyp> {
 public:
  Soft();
  ~Soft();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * - 0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, double **host_cutsq,
           double **host_prefactor, double **host_cut,
           double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// coeff.x = prefactor, coeff.y = cut, coeff.z = cutsq
  UCL_D_Vec<numtyp4> coeff;
  /// Special LJ values
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
};

}

#endif
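The header above documents the packed layout coeff.x = prefactor, coeff.y = cut, coeff.z = cutsq for every type pair. A minimal sketch of flattening square per-type tables into one array of 4-component entries is shown below; it only illustrates that layout and is not the library's type_pack4() implementation (which also handles the host write buffer and 1-based type indexing).

// Layout illustration only: one flat array of 4-component entries indexed by
// itype*ntypes + jtype, matching the coeff documentation above.
#include <cstdio>
#include <vector>

struct vec4 { double x, y, z, w; };

std::vector<vec4> pack_soft(int ntypes,
                            const std::vector<std::vector<double>> &prefactor,
                            const std::vector<std::vector<double>> &cut,
                            const std::vector<std::vector<double>> &cutsq) {
  std::vector<vec4> coeff(ntypes * ntypes);
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++)
      coeff[i * ntypes + j] = {prefactor[i][j], cut[i][j], cutsq[i][j], 0.0};
  return coeff;
}

int main() {
  std::vector<std::vector<double>> a{{10.0}}, c{{2.5}}, c2{{6.25}};
  vec4 e = pack_soft(1, a, c, c2)[0];
  printf("prefactor=%g cut=%g cutsq=%g\n", e.x, e.y, e.z);
  return 0;
}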
120
lib/gpu/lal_soft_ext.cpp
Normal file
@ -0,0 +1,120 @@

/***************************************************************************
soft_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)

Functions for LAMMPS access to soft acceleration routines.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : nguyentd@ornl.gov
***************************************************************************/

#include <iostream>
#include <cassert>
#include <math.h>

#include "lal_soft.h"

using namespace std;
using namespace LAMMPS_AL;

static Soft<PRECISION,ACC_PRECISION> SLMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
                  double **host_cut, double *special_lj,
                  const int inum, const int nall, const int max_nbors,
                  const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen) {
  SLMF.clear();
  gpu_mode=SLMF.device->gpu_mode();
  double gpu_split=SLMF.device->particle_split();
  int first_gpu=SLMF.device->first_device();
  int last_gpu=SLMF.device->last_device();
  int world_me=SLMF.device->world_me();
  int gpu_rank=SLMF.device->gpu_rank();
  int procs_per_gpu=SLMF.device->procs_per_gpu();

  SLMF.device->init_message(screen,"soft",first_gpu,last_gpu);

  bool message=false;
  if (SLMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  int init_ok=0;
  if (world_me==0)
    init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut,
                      special_lj, inum, nall, 300,
                      maxspecial, cell_size, gpu_split, screen);

  SLMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut,
                        special_lj, inum, nall, 300, maxspecial,
                        cell_size, gpu_split, screen);

    SLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    SLMF.estimate_gpu_overhead();
  return init_ok;
}

void soft_gpu_clear() {
  SLMF.clear();
}

int ** soft_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
                          double *sublo, double *subhi, int *tag, int **nspecial,
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum, const double cpu_time,
                          bool &success) {
  return SLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                      subhi, tag, nspecial, special, eflag, vflag, eatom,
                      vatom, host_start, ilist, jnum, cpu_time, success);
}

void soft_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success) {
  SLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
               firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

double soft_gpu_bytes() {
  return SLMF.host_memory_usage();
}
167
lib/gpu/lal_sw.cpp
Normal file
@ -0,0 +1,167 @@

/***************************************************************************
sw.cpp
-------------------
W. Michael Brown (ORNL)

Class for acceleration of the sw pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin : Tue March 26, 2013
email : brownw@ornl.gov
***************************************************************************/

#if defined(USE_OPENCL)
#include "sw_cl.h"
#elif defined(USE_CUDART)
const char *sw=0;
#else
#include "sw_cubin.h"
#endif

#include "lal_sw.h"
#include <cassert>
using namespace LAMMPS_AL;
#define SWT SW<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> device;

template <class numtyp, class acctyp>
SWT::SW() : BaseThree<numtyp,acctyp>(), _allocated(false) {
}

template <class numtyp, class acctyp>
SWT::~SW() {
  clear();
}

template <class numtyp, class acctyp>
int SWT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
int SWT::init(const int nlocal, const int nall, const int max_nbors,
              const double cell_size, const double gpu_split, FILE *_screen,
              const double epsilon, const double sigma,
              const double lambda, const double gamma,
              const double costheta, const double biga,
              const double bigb, const double powerp,
              const double powerq, const double cut, const double cutsq) {

  sw_epsilon=static_cast<numtyp>(epsilon);
  sw_sigma=static_cast<numtyp>(sigma);
  sw_lambda=static_cast<numtyp>(lambda);
  sw_gamma=static_cast<numtyp>(gamma);
  sw_costheta=static_cast<numtyp>(costheta);
  sw_biga=static_cast<numtyp>(biga);
  sw_bigb=static_cast<numtyp>(bigb);
  sw_powerp=static_cast<numtyp>(powerp);
  sw_powerq=static_cast<numtyp>(powerq);
  sw_cut=static_cast<numtyp>(cut);
  sw_cutsq=static_cast<numtyp>(cutsq);
  if (sw_cutsq>=sw_cut*sw_cut)
    sw_cutsq=sw_cut*sw_cut-1e-4;

  int success;
  success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                           _screen,sw,"k_sw","k_sw_three_center",
                           "k_sw_three_end");
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  shared_types=true;

  _allocated=true;
  this->_max_bytes=0;
  return 0;
}

template <class numtyp, class acctyp>
void SWT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  this->clear_atomic();
}

template <class numtyp, class acctyp>
double SWT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(SW<numtyp,acctyp>);
}

#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
  // Compute the block size and grid size to keep all cores busy
  int BX=this->block_pair();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  this->k_pair.set_size(GX,BX);
  this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor,
                   &this->_nbor_data->begin(), &this->ans->force,
                   &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch,
                   &this->_threads_per_atom, &sw_cut, &sw_epsilon, &sw_sigma,
                   &sw_biga, &sw_bigb, &sw_powerp, &sw_powerq, &sw_cutsq);

  BX=this->block_size();
  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                           (BX/(KTHREADS*JTHREADS))));
  this->k_three_center.set_size(GX,BX);
  this->k_three_center.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(), &this->ans->force,
                           &this->ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &evatom,
                           &sw_cut, &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
                           &sw_costheta, &sw_cutsq);
  Answer<numtyp,acctyp> *end_ans;
  #ifdef THREE_CONCURRENT
  end_ans=this->ans2;
  #else
  end_ans=this->ans;
  #endif
  if (evatom!=0) {
    this->k_three_end_vatom.set_size(GX,BX);
    this->k_three_end_vatom.run(&this->atom->x, &this->nbor->dev_nbor,
                                &this->_nbor_data->begin(), &end_ans->force,
                                &end_ans->engv, &eflag, &vflag, &ainum,
                                &nbor_pitch, &this->_threads_per_atom, &sw_cut,
                                &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
                                &sw_costheta, &sw_cutsq);
  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &this->nbor->dev_nbor,
                          &this->_nbor_data->begin(), &end_ans->force,
                          &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &sw_cut,
                          &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
                          &sw_costheta, &sw_cutsq);
  }
  this->time_pair.stop();
}

template class SW<PRECISION,ACC_PRECISION>;
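SWT::loop() above launches the two-body kernel with BX = block_pair() and _threads_per_atom threads per atom, then re-sizes the grid for the three-body kernels using KTHREADS*JTHREADS (both defined as _threads_per_atom). The stand-alone arithmetic below mirrors those two grid computations and the cutoff clamp applied in SWT::init(); all numbers are sample values.

// Stand-alone illustration of the grid sizing and cutoff clamp in lal_sw.cpp;
// the inputs are sample values, not values taken from the library.
#include <cmath>
#include <cstdio>

int main() {
  int inum = 8000, BX = 128, tpa = 4;   // sample values
  int KTHREADS = tpa, JTHREADS = tpa;   // as #defined in lal_sw.cpp
  int GX_pair = static_cast<int>(ceil(static_cast<double>(inum) / (BX / tpa)));
  int GX_three = static_cast<int>(ceil(static_cast<double>(inum) /
                                       (BX / (KTHREADS * JTHREADS))));
  printf("two-body blocks = %d, three-body blocks = %d\n", GX_pair, GX_three);

  // Cutoff clamp from SWT::init(): keep cutsq strictly below cut*cut.
  double cut = 3.77118, cutsq = cut * cut;
  if (cutsq >= cut * cut)
    cutsq = cut * cut - 1e-4;
  printf("clamped cutsq = %g\n", cutsq);
  return 0;
}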
73
lib/gpu/lal_sw.h
Normal file
@ -0,0 +1,73 @@

/***************************************************************************
sw.h
-------------------
W. Michael Brown (ORNL)

Class for acceleration of the sw pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin : Tue March 26, 2013
email : brownw@ornl.gov
***************************************************************************/

#ifndef LAL_SW_H
#define LAL_SW_H

#include "lal_base_three.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class SW : public BaseThree<numtyp, acctyp> {
 public:
  SW();
  ~SW();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * - 0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int nlocal, const int nall, const int max_nbors,
           const double cell_size, const double gpu_split, FILE *screen,
           const double epsilon, const double sigma,
           const double lambda, const double gamma,
           const double costheta, const double biga,
           const double bigb, const double powerp,
           const double powerq, const double cut, const double cutsq);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag, const int evatom);
  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta;
  numtyp sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq;
};

}

#endif
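The scalar parameters stored above (epsilon, sigma, biga, bigb, powerp, powerq, cut) match the standard Stillinger-Weber two-body term, usually written as phi2(r) = A*eps*(B*(sigma/r)^p - (sigma/r)^q)*exp(sigma/(r - a*sigma)) with the cutoff at a*sigma. The kernels are not part of this commit, so the sketch below is only a host-side reference evaluation under the assumption that sw_cut plays the role of a*sigma; the numerical inputs are illustrative silicon-like values.

// Reference (host-side) evaluation of the assumed Stillinger-Weber two-body
// term; not the GPU kernel, and sw_cut is assumed to equal a*sigma.
#include <cmath>
#include <cstdio>

double sw_phi2(double r, double eps, double sigma, double biga, double bigb,
               double powerp, double powerq, double cut) {
  if (r >= cut) return 0.0;
  return biga * eps * (bigb * pow(sigma / r, powerp) - pow(sigma / r, powerq)) *
         exp(sigma / (r - cut));
}

int main() {
  // Illustrative silicon-like parameters.
  printf("%g\n", sw_phi2(2.35, 2.1683, 2.0951, 7.049556277, 0.6022245584,
                         4.0, 0.0, 3.77118));
  return 0;
}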
128
lib/gpu/lal_sw_ext.cpp
Normal file
@ -0,0 +1,128 @@

/***************************************************************************
sw_ext.cpp
-------------------
W. Michael Brown (ORNL)

Functions for LAMMPS access to sw acceleration routines.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin : Tue March 26, 2013
email : brownw@ornl.gov
***************************************************************************/

#include <iostream>
#include <cassert>
#include <math.h>

#include "lal_sw.h"

using namespace std;
using namespace LAMMPS_AL;

static SW<PRECISION,ACC_PRECISION> SWMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int sw_gpu_init(const int inum, const int nall, const int max_nbors,
                const double cell_size, int &gpu_mode, FILE *screen,
                const double sw_epsilon, const double sw_sigma,
                const double sw_lambda, const double sw_gamma,
                const double sw_costheta, const double sw_biga,
                const double sw_bigb, const double sw_powerp,
                const double sw_powerq, const double sw_cut,
                const double sw_cutsq) {
  SWMF.clear();
  gpu_mode=SWMF.device->gpu_mode();
  double gpu_split=SWMF.device->particle_split();
  int first_gpu=SWMF.device->first_device();
  int last_gpu=SWMF.device->last_device();
  int world_me=SWMF.device->world_me();
  int gpu_rank=SWMF.device->gpu_rank();
  int procs_per_gpu=SWMF.device->procs_per_gpu();

  // disable host/device split for now
  if (gpu_split != 1.0)
    return -8;

  SWMF.device->init_message(screen,"sw/gpu",first_gpu,last_gpu);

  bool message=false;
  if (SWMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  int init_ok=0;
  if (world_me==0)
    init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
                      sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
                      sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq);

  SWMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
                        sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
                        sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut,
                        sw_cutsq);

    SWMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    SWMF.estimate_gpu_overhead();
  return init_ok;
}

void sw_gpu_clear() {
  SWMF.clear();
}

int ** sw_gpu_compute_n(const int ago, const int inum_full,
                        const int nall, double **host_x, int *host_type,
                        double *sublo, double *subhi, int *tag, int **nspecial,
                        int **special, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        int **ilist, int **jnum, const double cpu_time,
                        bool &success) {
  return SWMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                      subhi, tag, nspecial, special, eflag, vflag, eatom,
                      vatom, host_start, ilist, jnum, cpu_time, success);
}

void sw_gpu_compute(const int ago, const int nlocal, const int nall,
                    const int nlist, double **host_x, int *host_type,
                    int *ilist, int *numj, int **firstneigh, const bool eflag,
                    const bool vflag, const bool eatom, const bool vatom,
                    int &host_start, const double cpu_time, bool &success) {
  SWMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj,
               firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

double sw_gpu_bytes() {
  return SWMF.host_memory_usage();
}