Changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5278 f3b2605a-c512-4ea7-a41b-209d697bcdaa
pscrozi
2010-11-23 00:41:14 +00:00
parent 5a82c99485
commit 4e52d8d927
18 changed files with 3104 additions and 536 deletions


@@ -1,6 +1,7 @@
# Install/unInstall package files in LAMMPS
# edit Makefile.package to include/exclude GPU library
# do not copy gayberne files if non-GPU version does not exist
# do not copy charmm files if non-GPU version does not exist
if (test $1 = 1) then
@@ -12,14 +13,36 @@ if (test $1 = 1) then
sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package
sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package
fi
if (test -e ../pair_gayberne.cpp) then
cp pair_gayberne_gpu.cpp ..
cp pair_gayberne_gpu.h ..
fi
if (test -e ../pair_lj_cut_coul_long.cpp) then
cp pair_lj_cut_coul_long_gpu.cpp ..
cp pair_lj_cut_coul_long_gpu.h ..
fi
if (test -e ../pair_cg_cmm.cpp) then
cp pair_cg_cmm_gpu.cpp ..
cp pair_cg_cmm_gpu.h ..
fi
if (test -e ../pair_cg_cmm_coul_long.cpp) then
cp pair_cg_cmm_coul_long_gpu.cpp ..
cp pair_cg_cmm_coul_long_gpu.h ..
fi
cp pair_lj_cut_gpu.cpp ..
cp pair_lj96_cut_gpu.cpp ..
cp pair_lj_cut_coul_cut_gpu.cpp ..
cp pair_lj_cut_gpu.h ..
cp pair_lj96_cut_gpu.h ..
cp pair_lj_cut_coul_cut_gpu.h ..
cp fix_gpu.cpp ..
cp fix_gpu.h ..
elif (test $1 = 0) then
@@ -27,11 +50,23 @@ elif (test $1 = 0) then
sed -i -e 's/[^ \t]*gpu //' ../Makefile.package
sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package
fi
rm ../pair_gayberne_gpu.cpp
rm ../pair_lj_cut_gpu.cpp
rm ../pair_lj96_cut_gpu.cpp
rm ../pair_lj_cut_coul_cut_gpu.cpp
rm ../pair_lj_cut_coul_long_gpu.cpp
rm ../pair_cg_cmm_gpu.cpp
rm ../pair_cg_cmm_coul_long_gpu.cpp
rm ../fix_gpu.cpp
rm ../pair_gayberne_gpu.h
rm ../pair_lj_cut_gpu.h
rm ../pair_lj96_cut_gpu.h
rm ../pair_lj_cut_coul_cut_gpu.h
rm ../pair_lj_cut_coul_long_gpu.h
rm ../pair_cg_cmm_gpu.h
rm ../pair_cg_cmm_coul_long_gpu.h
rm ../fix_gpu.h
fi


@@ -3,10 +3,40 @@
# do not copy gayberne files if non-GPU version does not exist
for file in *.cpp *.h; do
if (test $file = pair_gayberne_gpu.cpp -a ! -e ../pair_gayberne.cpp) then
continue
fi
if (test $file = pair_gayberne_gpu.h -a ! -e ../pair_gayberne.cpp) then
continue
fi
if (test $file = pair_lj_cut_coul_long_gpu.cpp -a ! -e ../pair_lj_cut_coul_long.cpp) then
continue
fi
if (test $file = pair_lj_cut_coul_long_gpu.h -a ! -e ../pair_lj_cut_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_long_gpu.cpp -a ! -e ../pair_cg_cmm_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_long_gpu.h -a ! -e ../pair_cg_cmm_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi

src/GPU/fix_gpu.cpp (new file, 148 lines)

@@ -0,0 +1,148 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "string.h"
#include "stdlib.h"
#include "fix_gpu.h"
#include "atom.h"
#include "force.h"
#include "pair.h"
#include "respa.h"
#include "input.h"
#include "error.h"
#include "timer.h"
#include "modify.h"
#include "domain.h"
using namespace LAMMPS_NS;
enum{GPU_FORCE, GPU_NEIGH};
extern bool lmp_init_device(const int first_gpu, const int last_gpu,
const int gpu_mode, const double particle_split);
extern void lmp_clear_device();
extern double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul);
/* ---------------------------------------------------------------------- */
FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
Fix(lmp, narg, arg)
{
if (narg < 7) error->all("Illegal fix gpu command");
if (strcmp(arg[1],"all") != 0)
error->all("Illegal fix gpu command");
int gpu_mode, first_gpu, last_gpu;
double particle_split;
if (strcmp(arg[3],"force") == 0)
gpu_mode = GPU_FORCE;
else if (strcmp(arg[3],"force/neigh") == 0) {
gpu_mode = GPU_NEIGH;
if (domain->triclinic)
error->all("Cannot use force/neigh with triclinic box.");
} else
error->all("Illegal fix gpu command.");
first_gpu = atoi(arg[4]);
last_gpu = atoi(arg[5]);
particle_split = force->numeric(arg[6]);
if (particle_split==0 || particle_split>1)
error->all("Illegal fix gpu command.");
if (!lmp_init_device(first_gpu,last_gpu,gpu_mode,particle_split))
error->one("Could not find or initialize a specified accelerator device.");
}
/* ---------------------------------------------------------------------- */
FixGPU::~FixGPU()
{
lmp_clear_device();
}
/* ---------------------------------------------------------------------- */
int FixGPU::setmask()
{
int mask = 0;
mask |= POST_FORCE;
mask |= MIN_POST_FORCE;
return mask;
}
/* ---------------------------------------------------------------------- */
void FixGPU::init()
{
// Can only have 1 gpu fix that must be the first fix for a run
if ((void*)modify->fix[0] != (void*)this)
error->all("GPU is not the first fix for this run.");
}
/* ---------------------------------------------------------------------- */
void FixGPU::setup(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
void FixGPU::min_setup(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
void FixGPU::post_force(int vflag)
{
timer->stamp();
double lvirial[6];
for (int i = 0; i < 6; i++) lvirial[i] = 0.0;
double my_eng = lmp_gpu_forces(atom->f, atom->torque, force->pair->eatom,
force->pair->vatom, lvirial,
force->pair->eng_coul);
force->pair->eng_vdwl += my_eng;
force->pair->virial[0] += lvirial[0];
force->pair->virial[1] += lvirial[1];
force->pair->virial[2] += lvirial[2];
force->pair->virial[3] += lvirial[3];
force->pair->virial[4] += lvirial[4];
force->pair->virial[5] += lvirial[5];
if (force->pair->vflag_fdotr) force->pair->virial_compute();
timer->stamp(TIME_PAIR);
}
/* ---------------------------------------------------------------------- */
void FixGPU::min_post_force(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
double FixGPU::memory_usage()
{
double bytes = 0.0;
// Memory usage currently returned by pair routine
return bytes;
}
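Judging from the constructor above, the fix expects an input line of the form `fix <ID> all gpu force|force/neigh <first-gpu> <last-gpu> <split>`, where `<split>` is rejected if it is 0 or greater than 1 and presumably sets the fraction of particles handled on the accelerator. The following is a minimal standalone sketch, with a hypothetical stub in place of the real lmp_gpu_forces(), of the accumulation contract that post_force() relies on; only the data flow is taken from the code above.

// Standalone sketch (not part of the commit): post_force() assumes the device
// library accumulates into the host force arrays, fills a 6-component virial
// buffer, returns the vdW energy by value and the Coulomb energy by reference.
// The stub below is hypothetical and uses a simplified argument list.
#include <cstdio>

static double lmp_gpu_forces_stub(double **f, double *virial, double &ecoul)
{
  f[0][0] += 1.0;                           // forces are added, not overwritten
  for (int i = 0; i < 6; i++) virial[i] += 0.1;
  ecoul = 0.25;                             // Coulomb energy back by reference
  return 0.5;                               // vdW energy back by value
}

int main()
{
  double row[3] = {0.0, 0.0, 0.0};
  double *f[1] = {row};
  double lvirial[6] = {0.0}, virial[6] = {0.0};
  double eng_vdwl = 0.0, eng_coul = 0.0;

  // same accumulation pattern as FixGPU::post_force()
  eng_vdwl += lmp_gpu_forces_stub(f, lvirial, eng_coul);
  for (int i = 0; i < 6; i++) virial[i] += lvirial[i];

  std::printf("evdwl %g ecoul %g fx %g v0 %g\n",
              eng_vdwl, eng_coul, f[0][0], virial[0]);
  return 0;
}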

src/GPU/fix_gpu.h (new file, 45 lines)

@@ -0,0 +1,45 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef FIX_CLASS
FixStyle(gpu,FixGPU)
#else
#ifndef LMP_FIX_GPU_H
#define LMP_FIX_GPU_H
#include "fix.h"
namespace LAMMPS_NS {
class FixGPU : public Fix {
public:
FixGPU(class LAMMPS *, int, char **);
~FixGPU();
int setmask();
void init();
void setup(int);
void min_setup(int);
void post_force(int);
void min_post_force(int);
double memory_usage();
private:
};
}
#endif
#endif
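The #ifdef FIX_CLASS / FixStyle(gpu,FixGPU) guard above is the usual LAMMPS two-pass header pattern: the header is read once with FIX_CLASS defined, so that only the style-to-class registration macro is expanded into a lookup table, and once more as an ordinary include, so that the class definition itself is seen. A generic, hypothetical single-file sketch of the same idea (the names RegisterStyle, Base, and DemoImpl are invented for illustration):

#include <cstdio>
#include <functional>
#include <map>
#include <string>

struct Base { virtual ~Base() {} virtual const char *name() const = 0; };
struct DemoImpl : Base { const char *name() const override { return "demo"; } };

int main()
{
  // "Pass 1": with the guard macro defined, including the style header only
  // contributes one table entry mapping the style keyword to a factory.
  std::map<std::string, std::function<Base *()>> styles;
#define RegisterStyle(key, cls) styles[#key] = []() -> Base * { return new cls(); };
  RegisterStyle(demo, DemoImpl)   // what FixStyle(gpu,FixGPU) does, conceptually
#undef RegisterStyle

  // "Pass 2" is the normal include that provides the class definition itself
  // (inlined above as DemoImpl for this sketch).
  Base *b = styles["demo"]();
  std::printf("created style: %s\n", b->name());
  delete b;
  return 0;
}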

src/GPU/pair_cg_cmm_coul_long_gpu.cpp (new file, 499 lines)

@@ -0,0 +1,499 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_cg_cmm_coul_long_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#include "kspace.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
// External functions from cuda library for atom decomposition
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald);
void cmml_gpu_clear();
int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void cmml_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double cmml_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairCGCMMCoulLongGPU::PairCGCMMCoulLongGPU(LAMMPS *lmp) : PairCGCMMCoulLong(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairCGCMMCoulLongGPU::~PairCGCMMCoulLongGPU()
{
cmml_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = cmml_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
cmml_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::init_style()
{
cut_respa = NULL;
if (!atom->q_flag)
error->all("Pair style cg/cmm/coul/long requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
// insure use of KSpace long-range solver, set g_ewald
if (force->kspace == NULL)
error->all("Pair style is incompatible with KSpace style");
g_ewald = force->kspace->g_ewald;
// setup force tables
if (ncoultablebits) init_tables();
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3,
lj4, offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq,
cut_coulsq_global, force->special_coul,
force->qqrd2e, g_ewald);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU cg/cmm pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairCGCMMCoulLongGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + cmml_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
int *ilist,*jlist,*numneigh,**firstneigh;
double rsq;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
const double delx = xtmp - x[j][0];
const double dely = ytmp - x[j][1];
const double delz = ztmp - x[j][2];
const double rsq = delx*delx + dely*dely + delz*delz;
const int jtype = type[j];
double evdwl = 0.0;
double ecoul = 0.0;
double fpair = 0.0;
if (rsq < cutsq[itype][jtype]) {
const double r2inv = 1.0/rsq;
const int cgt=cg_type[itype][jtype];
double forcelj = 0.0;
double forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
forcelj=factor_lj;
if (eflag) evdwl=factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv=r2inv*r2inv;
forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
forcelj *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
forcelj *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
}
if (rsq < cut_coulsq_global) {
if (!ncoultablebits || rsq <= tabinnersq) {
const double r = sqrt(rsq);
const double grij = g_ewald * r;
const double expm2 = exp(-grij*grij);
const double t = 1.0 / (1.0 + EWALD_P*grij);
const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const double prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (eflag) ecoul = prefactor*erfc;
if (factor_coul < 1.0) {
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
int itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
const double fraction = (rsq_lookup.f - rtable[itable]) *
drtable[itable];
const double table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (eflag) {
const double table2 = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table2;
}
if (factor_coul < 1.0) {
const double table2 = ctable[itable] + fraction*dctable[itable];
const double prefactor = qtmp*q[j] * table2;
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
}
}
fpair = (forcecoul + forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
double rsq;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
double evdwl = 0.0;
double ecoul = 0.0;
double fpair = 0.0;
if (rsq < cutsq[itype][jtype]) {
const double r2inv = 1.0/rsq;
const int cgt=cg_type[itype][jtype];
double forcelj = 0.0;
double forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
forcelj=factor_lj;
if (eflag) evdwl=factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv=r2inv*r2inv;
forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
forcelj *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
forcelj *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
}
if (rsq < cut_coulsq_global) {
if (!ncoultablebits || rsq <= tabinnersq) {
const double r = sqrt(rsq);
const double grij = g_ewald * r;
const double expm2 = exp(-grij*grij);
const double t = 1.0 / (1.0 + EWALD_P*grij);
const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const double prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (eflag) ecoul = prefactor*erfc;
if (factor_coul < 1.0) {
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
int itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
const double fraction = (rsq_lookup.f - rtable[itable]) *
drtable[itable];
const double table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (eflag) {
const double table2 = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table2;
}
if (factor_coul < 1.0) {
const double table2 = ctable[itable] + fraction*dctable[itable];
const double prefactor = qtmp*q[j] * table2;
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
}
}
fpair = (forcecoul + forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}
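For reference, the real-space Coulomb terms evaluated in the two cpu_compute() fallbacks above are the standard short-range Ewald expressions; EWALD_P and A1-A5 are the coefficients of the Abramowitz-Stegun rational approximation of erfc, and EWALD_F is 2/sqrt(pi). With g the g_ewald splitting parameter and the qqrd2e conversion factor omitted, the per-pair energy and force magnitude are

E_{ij} = \frac{q_i q_j}{r}\,\mathrm{erfc}(g r)

F_{ij} = \frac{q_i q_j}{r^{2}}\left[\mathrm{erfc}(g r) + \frac{2 g r}{\sqrt{\pi}}\,e^{-g^{2} r^{2}}\right]

\mathrm{erfc}(x) \approx t\left(A_1 + t(A_2 + t(A_3 + t(A_4 + t A_5)))\right)e^{-x^{2}},\qquad t = \frac{1}{1 + 0.3275911\,x}

In the code, forcecoul is r*F_{ij} and fpair = forcecoul*r2inv is F_{ij}/r, the per-distance factor applied to each component of (delx,dely,delz); ecoul is E_{ij} scaled by qqrd2e.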

src/GPU/pair_cg_cmm_coul_long_gpu.h (new file, 48 lines)

@@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(cg/cmm/coul/long/gpu,PairCGCMMCoulLongGPU)
#else
#ifndef LMP_PAIR_CG_CMM_COUL_LONG_GPU_H
#define LMP_PAIR_CG_CMM_COUL_LONG_GPU_H
#include "pair_cg_cmm_coul_long.h"
namespace LAMMPS_NS {
class PairCGCMMCoulLongGPU : public PairCGCMMCoulLong {
public:
PairCGCMMCoulLongGPU(LAMMPS *lmp);
~PairCGCMMCoulLongGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

src/GPU/pair_cg_cmm_gpu.cpp (new file, 362 lines)

@@ -0,0 +1,362 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_cg_cmm_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen);
void cmm_gpu_clear();
int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void cmm_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double cmm_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairCGCMMGPU::PairCGCMMGPU(LAMMPS *lmp) : PairCGCMM(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairCGCMMGPU::~PairCGCMMGPU()
{
cmm_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = cmm_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
cmm_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairCGCMMGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU CGCMM pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairCGCMMGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + cmm_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
const int cgt=cg_type[itype][jtype];
r2inv = 1.0/rsq;
fpair = factor_lj;
if (eflag) evdwl = factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv = r2inv*r2inv;
fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
fpair *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
fpair *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
fpair *= r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
const int cgt=cg_type[itype][jtype];
r2inv = 1.0/rsq;
fpair = factor_lj;
if (eflag) evdwl = factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv = r2inv*r2inv;
fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
fpair *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
fpair *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
fpair *= r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}
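The cpu_compute(int *nbors, ...) overload above walks the packed neighbor array handed back by the *_gpu_compute_n() builders in a strided, interleaved layout: the first nlocal-start integers are the per-atom neighbor counts, and each following block of nlocal-start integers holds the k-th neighbor of every atom, presumably so that adjacent GPU threads read adjacent addresses. A small standalone sketch with made-up sizes and neighbor indices:

#include <cstdio>
#include <vector>

// Hypothetical toy case: 3 host atoms (start=0, nlocal=3), up to 2 neighbors each.
// Layout assumed from the loops above:
//   nbors[i]                = neighbor count of atom i      (first "stride" ints)
//   nbors[i + (k+1)*stride] = k-th neighbor index of atom i  (one block per slot)
int main()
{
  const int start = 0, nlocal = 3;
  const int stride = nlocal - start;
  std::vector<int> nbors = { 2, 1, 2,      // counts: atom0=2, atom1=1, atom2=2
                             5, 4, 1,      // slot 0: first neighbor of each atom
                             7, 0, 6 };    // slot 1: second neighbor (unused for atom1)
  for (int i = start; i < nlocal; i++) {
    const int *nbor = nbors.data() + i - start;
    int jnum = *nbor;                      // same walk as in cpu_compute()
    nbor += stride;
    const int *nbor_end = nbor + stride * jnum;
    std::printf("atom %d:", i);
    for (; nbor < nbor_end; nbor += stride) std::printf(" %d", *nbor);
    std::printf("\n");
  }
  return 0;
}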

src/GPU/pair_cg_cmm_gpu.h (new file, 47 lines)

@@ -0,0 +1,47 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(cg/cmm/gpu,PairCGCMMGPU)
#else
#ifndef LMP_PAIR_CG_CMM_GPU_H
#define LMP_PAIR_CG_CMM_GPU_H
#include "pair_cg_cmm.h"
namespace LAMMPS_NS {
class PairCGCMMGPU : public PairCGCMM {
public:
PairCGCMMGPU(LAMMPS *lmp);
~PairCGCMMGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

src/GPU/pair_gayberne_gpu.cpp (modified)

@@ -31,123 +31,47 @@
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon,
const double mu, double **shape, double **well, double **cutsq,
double **sigma, double **epsilon, double *host_lshape,
int **form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
int &gpu_mode, FILE *screen);
void gb_gpu_clear();
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double **host_quat);
int * gb_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double **host_quat);
double gb_gpu_bytes();
using namespace LAMMPS_NS;
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
/* ---------------------------------------------------------------------- */
PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp),
gpu_mode(GPU_PAIR)
{
}
/* ----------------------------------------------------------------------
@@ -156,26 +80,8 @@ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
PairGayBerneGPU::~PairGayBerneGPU()
{
gb_gpu_clear();
cpu_time = 0.0;
}
/* ---------------------------------------------------------------------- */
@@ -183,141 +89,38 @@ PairGayBerneGPU::~PairGayBerneGPU()
void PairGayBerneGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = gb_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo, domain->subhi,
eflag, vflag, eflag_atom, vflag_atom, host_start,
cpu_time, success, atom->quat);
} else {
inum = list->inum;
olist = gb_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh,
list->firstneigh, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->quat);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start < inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist,host_start,eflag,vflag);
else
cpu_compute(host_start,eflag,vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
@@ -326,8 +129,6 @@ void PairGayBerneGPU::settings(int narg, char **arg)
void PairGayBerneGPU::init_style()
{
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type)
@@ -335,24 +136,7 @@ void PairGayBerneGPU::init_style()
if (atom->radius_flag)
error->all("Pair gayberne cannot be used with atom attribute diameter");
// per-type shape precalculations
for (int i = 1; i <= atom->ntypes; i++) {
if (setwell[i]) {
double *one = atom->shape[i];
@@ -364,72 +148,38 @@ void PairGayBerneGPU::init_style()
}
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu,
shape, well, cutsq, sigma, epsilon, lshape, form,
lj1, lj2, lj3, lj4, offset, force->special_lj,
atom->nlocal, atom->nlocal+atom->nghost, 300,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU Gay-Berne pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
@@ -437,6 +187,275 @@ void PairGayBerneGPU::init_style()
double PairGayBerneGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + gb_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
double fforce[3],ttor[3],rtor[3],r12[3];
double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
double **quat = atom->quat;
double **tor = atom->torque;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = olist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
itype = type[i];
if (form[itype][itype] == ELLIPSE_ELLIPSE) {
MathExtra::quat_to_mat_trans(quat[i],a1);
MathExtra::diag_times3(well[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,b1);
MathExtra::diag_times3(shape[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,g1);
}
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
// r12 = center to center vector
r12[0] = x[j][0]-x[i][0];
r12[1] = x[j][1]-x[i][1];
r12[2] = x[j][2]-x[i][2];
rsq = MathExtra::dot3(r12,r12);
jtype = type[j];
// compute if less than cutoff
if (rsq < cutsq[itype][jtype]) {
switch (form[itype][jtype]) {
case SPHERE_SPHERE:
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
forcelj *= -r2inv;
if (eflag) one_eng =
r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
offset[itype][jtype];
fforce[0] = r12[0]*forcelj;
fforce[1] = r12[1]*forcelj;
fforce[2] = r12[2]*forcelj;
ttor[0] = ttor[1] = ttor[2] = 0.0;
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
case SPHERE_ELLIPSE:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
ttor[0] = ttor[1] = ttor[2] = 0.0;
break;
case ELLIPSE_SPHERE:
one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
default:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
fforce,ttor,rtor);
break;
}
fforce[0] *= factor_lj;
fforce[1] *= factor_lj;
fforce[2] *= factor_lj;
ttor[0] *= factor_lj;
ttor[1] *= factor_lj;
ttor[2] *= factor_lj;
f[i][0] += fforce[0];
f[i][1] += fforce[1];
f[i][2] += fforce[2];
tor[i][0] += ttor[0];
tor[i][1] += ttor[1];
tor[i][2] += ttor[2];
if (eflag) evdwl = factor_lj*one_eng;
if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],fforce[2],
-r12[0],-r12[1],-r12[2]);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairGayBerneGPU::cpu_compute(int *nbors, int start, int eflag, int vflag)
{
int i,j,itype,jtype;
double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
double fforce[3],ttor[3],rtor[3],r12[3];
double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
double **x = atom->x;
double **f = atom->f;
double **quat = atom->quat;
double **tor = atom->torque;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_lj = force->special_lj;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
itype = type[i];
if (form[itype][itype] == ELLIPSE_ELLIPSE) {
MathExtra::quat_to_mat_trans(quat[i],a1);
MathExtra::diag_times3(well[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,b1);
MathExtra::diag_times3(shape[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,g1);
}
int *nbor = nbors+i-start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for ( ; nbor < nbor_end; nbor += stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
// r12 = center to center vector
r12[0] = x[j][0]-x[i][0];
r12[1] = x[j][1]-x[i][1];
r12[2] = x[j][2]-x[i][2];
rsq = MathExtra::dot3(r12,r12);
jtype = type[j];
// compute if less than cutoff
if (rsq < cutsq[itype][jtype]) {
switch (form[itype][jtype]) {
case SPHERE_SPHERE:
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
forcelj *= -r2inv;
if (eflag) one_eng =
r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
offset[itype][jtype];
fforce[0] = r12[0]*forcelj;
fforce[1] = r12[1]*forcelj;
fforce[2] = r12[2]*forcelj;
ttor[0] = ttor[1] = ttor[2] = 0.0;
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
case SPHERE_ELLIPSE:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
ttor[0] = ttor[1] = ttor[2] = 0.0;
break;
case ELLIPSE_SPHERE:
one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
default:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
fforce,ttor,rtor);
break;
}
fforce[0] *= factor_lj;
fforce[1] *= factor_lj;
fforce[2] *= factor_lj;
ttor[0] *= factor_lj;
ttor[1] *= factor_lj;
ttor[2] *= factor_lj;
f[i][0] += fforce[0];
f[i][1] += fforce[1];
f[i][2] += fforce[2];
tor[i][0] += ttor[0];
tor[i][1] += ttor[1];
tor[i][2] += ttor[2];
if (eflag) evdwl = factor_lj*one_eng;
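// j < start means the partner was already handled on the GPU, so only atom
// i's share is tallied here; otherwise apply the reaction force/torque to a
// local j and tally the pair for both atoms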
if (j<start) {
if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],
fforce[2],-r12[0],-r12[1],-r12[2]);
} else {
if (j < nlocal) {
rtor[0] *= factor_lj;
rtor[1] *= factor_lj;
rtor[2] *= factor_lj;
f[j][0] -= fforce[0];
f[j][1] -= fforce[1];
f[j][2] -= fforce[2];
tor[j][0] += rtor[0];
tor[j][1] += rtor[1];
tor[j][2] += rtor[2];
}
if (evflag) ev_tally_xyz(i,j,nlocal,0,
evdwl,0.0,fforce[0],fforce[1],fforce[2],
-r12[0],-r12[1],-r12[2]);
}
}
}
}
}


@@ -29,21 +29,19 @@ class PairGayBerneGPU : public PairGayBerne {
public:
PairGayBerneGPU(LAMMPS *lmp);
~PairGayBerneGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}


@@ -0,0 +1,318 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj96_cut_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen);
void lj96_gpu_clear();
int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void lj96_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double lj96_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJ96CutGPU::PairLJ96CutGPU(LAMMPS *lmp) : PairLJ96Cut(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJ96CutGPU::~PairLJ96CutGPU()
{
lj96_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
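// gpu_mode selects where neighboring is done: GPU_NEIGH builds the neighbor
// list on the device from raw positions/tags, GPU_PAIR ships the host-built
// full neighbor list to the device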
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = lj96_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
lj96_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
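// atoms host_start..inum-1 are left to the host; time the CPU portion
// (cpu_time is presumably picked up by fix gpu for reporting/balancing)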
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJ96CutGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
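// 300 is the per-atom neighbor allocation (max_nbors) handed to the GPU
// library; gpu_mode is set by the library (GPU_PAIR or GPU_NEIGH)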
bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ96 pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJ96CutGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + lj96_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
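// this path uses the regular LAMMPS full neighbor list requested in
// init_style; list entries before index start were already computed on the GPU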
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
r3inv = sqrt(r6inv);
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
r3inv = sqrt(r6inv);
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}


@@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj96/cut/gpu,PairLJ96CutGPU)
#else
#ifndef LMP_PAIR_LJ_96_GPU_H
#define LMP_PAIR_LJ_96_GPU_H
#include "pair_lj96_cut.h"
namespace LAMMPS_NS {
class PairLJ96CutGPU : public PairLJ96Cut {
public:
PairLJ96CutGPU(LAMMPS *lmp);
~PairLJ96CutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif


@@ -0,0 +1,364 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_coul_cut_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e);
void ljc_gpu_clear();
int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void ljc_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double ljc_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutCoulCutGPU::PairLJCutCoulCutGPU(LAMMPS *lmp) : PairLJCutCoulCut(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJCutCoulCutGPU::~PairLJCutCoulCutGPU()
{
ljc_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljc_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
ljc_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::init_style()
{
if (!atom->q_flag)
error->all("Pair style lj/cut/coul/cut requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
force->special_coul, force->qqrd2e);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJCutCoulCutGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + ljc_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
int newton_pair = force->newton_pair;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
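// LJ and Coulomb terms use their own inner cutoffs within the overall pair
// cutoff, so either contribution may be zero for this pair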
if (rsq < cut_coulsq[itype][jtype])
forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq[itype][jtype])
ecoul = factor_coul * qqrd2e * qtmp*q[j]*sqrt(r2inv);
else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq[itype][jtype])
forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq[itype][jtype])
ecoul = factor_coul * qqrd2e * qtmp*q[j]*sqrt(r2inv);
else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}


@@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj/cut/coul/cut/gpu,PairLJCutCoulCutGPU)
#else
#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_GPU_H
#define LMP_PAIR_LJ_CUT_COUL_CUT_GPU_H
#include "pair_lj_cut_coul_cut.h"
namespace LAMMPS_NS {
class PairLJCutCoulCutGPU : public PairLJCutCoulCut {
public:
PairLJCutCoulCutGPU(LAMMPS *lmp);
~PairLJCutCoulCutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif


@@ -0,0 +1,452 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_coul_long_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#include "kspace.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
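// EWALD_P and A1-A5 are the Abramowitz & Stegun coefficients of the erfc()
// approximation used for the real-space Ewald term in cpu_compute()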
// External functions from cuda library for atom decomposition
bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald);
void ljcl_gpu_clear();
int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void ljcl_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double ljcl_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutCoulLongGPU::PairLJCutCoulLongGPU(LAMMPS *lmp) : PairLJCutCoulLong(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJCutCoulLongGPU::~PairLJCutCoulLongGPU()
{
ljcl_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljcl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
ljcl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::init_style()
{
cut_respa = NULL;
if (!atom->q_flag)
error->all("Pair style lj/cut/coul/long requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
cut_coulsq = cut_coul * cut_coul;
// insure use of KSpace long-range solver, set g_ewald
if (force->kspace == NULL)
error->all("Pair style is incompatible with KSpace style");
g_ewald = force->kspace->g_ewald;
// setup force tables
if (ncoultablebits) init_tables();
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
force->special_coul, force->qqrd2e, g_ewald);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJCutCoulLongGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + ljcl_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
int *ilist,*jlist,*numneigh,**firstneigh;
double rsq;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq) {
r = sqrt(rsq);
grij = g_ewald * r;
expm2 = exp(-grij*grij);
t = 1.0 / (1.0 + EWALD_P*grij);
erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
} else {
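// tabulated Coulomb path: reinterpret the bits of rsq to index the
// precomputed force/energy tables built by init_tables()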
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (factor_coul < 1.0) {
table = ctable[itable] + fraction*dctable[itable];
prefactor = qtmp*q[j] * table;
forcecoul -= (1.0-factor_coul)*prefactor;
}
}
} else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq)
ecoul = prefactor*erfc;
else {
table = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table;
}
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
} else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
double rsq;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq) {
r = sqrt(rsq);
grij = g_ewald * r;
expm2 = exp(-grij*grij);
t = 1.0 / (1.0 + EWALD_P*grij);
erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (factor_coul < 1.0) {
table = ctable[itable] + fraction*dctable[itable];
prefactor = qtmp*q[j] * table;
forcecoul -= (1.0-factor_coul)*prefactor;
}
}
} else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq)
ecoul = prefactor*erfc;
else {
table = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table;
}
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
} else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}


@@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj/cut/coul/long/gpu,PairLJCutCoulLongGPU)
#else
#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_GPU_H
#define LMP_PAIR_LJ_CUT_COUL_LONG_GPU_H
#include "pair_lj_cut_coul_long.h"
namespace LAMMPS_NS {
class PairLJCutCoulLongGPU : public PairLJCutCoulLong {
public:
PairLJCutCoulLongGPU(LAMMPS *lmp);
~PairLJCutCoulLongGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif


@@ -2,28 +2,24 @@
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
@@ -32,79 +28,45 @@
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen);
void ljl_gpu_clear();
int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void ljl_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double ljl_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
@@ -113,17 +75,7 @@
PairLJCutGPU::~PairLJCutGPU()
{
ljl_gpu_clear();
}
/* ---------------------------------------------------------------------- */
@@ -132,45 +84,37 @@
void PairLJCutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
ljl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
@@ -179,74 +123,45 @@
void PairLJCutGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
@@ -254,6 +169,149 @@
double PairLJCutGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + ljl_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}


@@ -17,8 +17,8 @@ PairStyle(lj/cut/gpu,PairLJCutGPU)
#else
#ifndef LMP_PAIR_LJ_LIGHT_GPU_H
#define LMP_PAIR_LJ_LIGHT_GPU_H
#include "pair_lj_cut.h"
@@ -28,20 +28,21 @@ class PairLJCutGPU : public PairLJCut {
public:
PairLJCutGPU(LAMMPS *lmp);
~PairLJCutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif