Merge branch 'lammps-icms' of bitbucket.org:akohlmey/lammps-icms into lammps-icms

Resolved Conflicts: doc/Manual.html doc/Manual.txt doc/package.txt
2014-08-19 09:49:46 -04:00
parent 39a94bfbdc 2e5648e42c
commit 91f0acc6ad
31 changed files with 290 additions and 342 deletions
--- a/doc/Manual.html
+++ b/doc/Manual.html
@ -1,7 +1,7 @@
 <HTML>
 <HEAD>
 <TITLE>LAMMPS-ICMS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="14 Aug 2014 version">
+<META NAME="docnumber" CONTENT="15 Aug 2014 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@ -22,7 +22,7 @@

 <CENTER><H3>LAMMPS-ICMS Documentation 
 </H3></CENTER>
-<CENTER><H4>14 Aug 2014 version 
+<CENTER><H4>15 Aug 2014 version 
 </H4></CENTER>
 <H4>Version info: 
 </H4>
--- a/doc/Manual.txt
+++ b/doc/Manual.txt
@ -1,6 +1,6 @@
 <HEAD>
 <TITLE>LAMMPS-ICMS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="14 Aug 2014 version">
+<META NAME="docnumber" CONTENT="15 Aug 2014 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@ -18,7 +18,7 @@
 <H1></H1>

 LAMMPS-ICMS Documentation :c,h3
-14 Aug 2014 version :c,h4
+15 Aug 2014 version :c,h4

 Version info: :h4

--- a/doc/Section_intro.html
+++ b/doc/Section_intro.html
@ -109,7 +109,7 @@ it to LAMMPS.
 <LI>  open-source distribution
 <LI>  highly portable C++
 <LI>  optional libraries used: MPI and single-processor FFT
-<LI>  Intel Xeon Phi, GPU (CUDA and OpenCL), and OpenMP support for many code features
+<LI>  GPU (CUDA and OpenCL), Intel Xeon Phi, and OpenMP support for many code features
 <LI>  easy to extend with new features and functionality
 <LI>  runs from an input script
 <LI>  syntax for defining and using variables and formulas
--- a/doc/Section_intro.txt
+++ b/doc/Section_intro.txt
@ -105,7 +105,7 @@ General features :h4
  open-source distribution
  highly portable C++
  optional libraries used: MPI and single-processor FFT
-  Intel Xeon Phi, GPU (CUDA and OpenCL), and OpenMP support for many code features
+  GPU (CUDA and OpenCL), Intel Xeon Phi, and OpenMP support for many code features
  easy to extend with new features and functionality
  runs from an input script
  syntax for defining and using variables and formulas
--- a/doc/package.txt
+++ b/doc/package.txt
@ -39,7 +39,7 @@ args = arguments specific to the style :l
      {cellsize} value = dist
        dist = length (distance units) in each dimension for neighbor bins
      {device} value = device_type
-        device_type = {kepler} or {fermi} or {cypress} or {phi} or {generic}
+        device_type = {kepler} or {fermi} or {cypress} or {phi} or {intel} or {generic}
  {intel} args = Nthreads precision keyword value ...
    Nthreads = # of OpenMP threads to associate with each MPI process on host
    precision = {single} or {mixed} or {double}
--- a/examples/intel/in.intel.lc
+++ b/examples/intel/in.intel.lc
@ -3,7 +3,9 @@
 # shape: 2 1.5 1
 # cutoff 4.0 with skin 0.8
 # NPT, T=2.4, P=8.0
+
 package intel * mixed balance $b
+package omp *
 suffix $s
 processors * * * grid numa

--- a/examples/intel/in.intel.rhodo
+++ b/examples/intel/in.intel.rhodo
@ -1,5 +1,7 @@
 # Rhodopsin model
+
 package intel * mixed balance $b
+package omp *
 suffix $s

 variable	x index 4
--- a/src/REPLICA/verlet_split.cpp
+++ b/src/REPLICA/verlet_split.cpp
@ -407,9 +407,11 @@ void VerletSplit::run(int n)
    // all output

    if (master) {
+      timer->stamp();
      if (n_post_force) modify->post_force(vflag);
      modify->final_integrate();
      if (n_end_of_step) modify->end_of_step();
+      timer->stamp(Timer::MODIFY);

      if (ntimestep == output->next) {
        timer->stamp();
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@ -67,41 +67,3 @@ elif (test $mode = 0) then
  touch ../accelerator_intel.h

 fi
-
-# step 3: map omp styles that are not in the intel package to intel suffix
-
-#if (test $mode = 0) then
-#
-#  rm -f ../*ompinto_intel*
-#
-#else
-#
-#  echo "  The 'intel' suffix will use the USER-OMP package for all"
-#  echo "  angle, bond, dihedral, kspace, and improper styles:"
-#  stylelist="pair fix angle bond dihedral improper"
-#  for header in $stylelist; do
-#    HEADER=`echo $header | sed 's/\(.*\)/\U\1/'`
-#    outfile=../$header"_ompinto_intel.h"
-#    echo "    Creating $header style map: $outfile"
-#    echo -n "// -- Header to map USER-OMP " > $outfile  
-#    echo "styles to the intel suffix" >> $outfile
-#    echo >> $outfile
-#    echo "#ifdef "$HEADER"_CLASS" >> $outfile
-#    grep -h 'Style(' ../$header*_omp.h | grep -v 'charmm/coul/long' | \
-#	grep -v 'lj/cut' | grep -v 'gayberne' | \
-#	sed 's/\/omp/\/intel/g' >> $outfile
-#    echo "#endif" >> $outfile
-#  done
-#
-#  header="kspace"
-#  HEADER="KSPACE"
-#  outfile=../$header"_ompinto_intel.h"
-#  echo "    Creating $header style map: $outfile"
-#  echo -n "// -- Header to map USER-OMP " > $outfile  
-#  echo "styles to the intel suffix" >> $outfile
-#  echo >> $outfile
-#  echo "#ifdef "$HEADER"_CLASS" >> $outfile
-#  grep -h 'KSpaceStyle(' ../*_omp.h | sed 's/\/omp/\/intel/g' >> $outfile
-#  echo "#endif" >> $outfile
-#
-#fi
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@ -128,7 +128,7 @@ class FixIntel : public Fix {

 protected:
  int _overflow_flag[5];
-  __declspec(align(64)) int _off_overflow_flag[5];
+  _alignvar(int _off_overflow_flag[5],64);
  int _allow_separate_buffers, _offload_ghost;
  #ifdef _LMP_INTEL_OFFLOAD
  double _balance_pair_time, _balance_other_time;
@ -155,18 +155,18 @@ class FixIntel : public Fix {
  double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
  double _timers[NUM_ITIMERS];
  double _stopwatch[NUM_ITIMERS];
-  __declspec(align(64)) double _stopwatch_offload_neighbor[1];
-  __declspec(align(64)) double _stopwatch_offload_pair[1];
+  _alignvar(double _stopwatch_offload_neighbor[1],64);
+  _alignvar(double _stopwatch_offload_pair[1],64);

  template <class ft, class acc_t>
-  inline void add_results(const ft * restrict const f_in,
-                          const acc_t * restrict const ev_global,
+  inline void add_results(const ft * _noalias const f_in,
+                          const acc_t * _noalias const ev_global,
                          const int eatom, const int vatom,
 			  const int offload);

  template <class ft, class acc_t>
-  inline void add_oresults(const ft * restrict const f_in,
-			   const acc_t * restrict const ev_global,
+  inline void add_oresults(const ft * _noalias const f_in,
+			   const acc_t * _noalias const ev_global,
 			   const int eatom, const int vatom,
 			   const int out_offset, const int nall);

@ -176,8 +176,8 @@ class FixIntel : public Fix {
  int _im_real_space_task;
  MPI_Comm _real_space_comm;
  template <class ft, class acc_t>
-  inline void add_off_results(const ft * restrict const f_in,
-                              const acc_t * restrict const ev_global);
+  inline void add_off_results(const ft * _noalias const f_in,
+                              const acc_t * _noalias const ev_global);
  #endif
 };

@ -284,8 +284,8 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
 /* ---------------------------------------------------------------------- */

 template <class ft, class acc_t>
-void FixIntel::add_results(const ft * restrict const f_in,
-                           const acc_t * restrict const ev_global,
+void FixIntel::add_results(const ft * _noalias const f_in,
+                           const acc_t * _noalias const ev_global,
                           const int eatom, const int vatom,
 			   const int offload) {
  start_watch(TIME_PACK);
@ -295,7 +295,7 @@ void FixIntel::add_results(const ft * restrict const f_in,
    if (offload) {
      add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
      if (force->newton_pair) {
-	const acc_t * restrict const enull = 0;
+	const acc_t * _noalias const enull = 0;
 	int offset = _offload_nlocal;
 	if (atom->torque) offset *= 2;
 	add_oresults(f_in + offset, enull, eatom, vatom, 
@ -305,7 +305,7 @@ void FixIntel::add_results(const ft * restrict const f_in,
      add_oresults(f_in, ev_global, eatom, vatom,
 		   _host_min_local, _host_used_local);
      if (force->newton_pair) {
-	const acc_t * restrict const enull = 0;
+	const acc_t * _noalias const enull = 0;
 	int offset = _host_used_local;
 	if (atom->torque) offset *= 2;
 	add_oresults(f_in + offset, enull, eatom, 
@ -333,11 +333,11 @@ void FixIntel::add_results(const ft * restrict const f_in,
 /* ---------------------------------------------------------------------- */

 template <class ft, class acc_t>
-void FixIntel::add_oresults(const ft * restrict const f_in,
-			    const acc_t * restrict const ev_global,
+void FixIntel::add_oresults(const ft * _noalias const f_in,
+			    const acc_t * _noalias const ev_global,
 			    const int eatom, const int vatom,
 			    const int out_offset, const int nall) {
-  lmp_ft * restrict const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
+  lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
  if (atom->torque) {
    if (f_in[1].w)
      if (f_in[1].w == 1)
@ -356,7 +356,7 @@ void FixIntel::add_oresults(const ft * restrict const f_in,
    IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
    if (atom->torque) {
      int ii = ifrom * 2;
-      lmp_ft * restrict const tor = (lmp_ft *) lmp->atom->torque[0] +
+      lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
 	out_offset;
      if (eatom) {
        for (int i = ifrom; i < ito; i++) {
@ -464,8 +464,8 @@ void FixIntel::set_neighbor_host_sizes() {
 /* ---------------------------------------------------------------------- */

 template <class ft, class acc_t>
-void FixIntel::add_off_results(const ft * restrict const f_in,
-                               const acc_t * restrict const ev_global) {
+void FixIntel::add_off_results(const ft * _noalias const f_in,
+                               const acc_t * _noalias const ev_global) {
  if (_offload_balance < 0.0)
    _balance_other_time = MPI_Wtime() - _balance_other_time;

--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@ -22,8 +22,8 @@ using namespace LAMMPS_NS;

 template <class flt_t, class acc_t>
 IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
-    lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _buf_size(0),
-    _buf_local_size(0), _off_threads(0) {
+    lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
+    _buf_size(0), _buf_local_size(0) {
  _list_alloc_atoms = 0;
  _ntypes = 0;
  _off_map_maxlocal = 0;
@ -423,6 +423,8 @@ double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
  tmem += _off_map_maxlocal * sizeof(int);
  tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
  tmem += _ntypes * _ntypes * sizeof(int);
+
+  return tmem;
 }

 /* ---------------------------------------------------------------------- */
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@ -266,8 +266,8 @@ class IntelBuffers {
  #endif
  
  int _buf_size, _buf_local_size;
-  __declspec(align(64)) acc_t _ev_global[8];
-  __declspec(align(64)) acc_t _ev_global_host[8];
+  _alignvar(acc_t _ev_global[8],64);
+  _alignvar(acc_t _ev_global_host[8],64);

  void _grow(const int nall, const int nlocal, const int nthreads,
 	     const int offload_end);
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@ -351,7 +351,7 @@ inline double MIC_Wtime() {
    for (int t = 1; t < nthreads; t++) {				\
      _Pragma("vector nontemporal")					\
      for (int n = iifrom; n < iito; n++) {				\
-        f_start[n].x += f_start[n + t_off].x;				\ 
+        f_start[n].x += f_start[n + t_off].x;				\
        f_start[n].y += f_start[n + t_off].y;				\
 	f_start[n].z += f_start[n + t_off].z;				\
 	f_start[n].w += f_start[n + t_off].w;				\
@ -361,7 +361,7 @@ inline double MIC_Wtime() {
  } else {								\
    for (int t = 1; t < nthreads; t++) {				\
      _Pragma("vector nontemporal")   					\
-      for (int n = iifrom; n < iito; n++) {                             \ 
+      for (int n = iifrom; n < iito; n++) {                             \
 	f_start[n].x += f_start[n + t_off].x;                  	        \
        f_start[n].y += f_start[n + t_off].y;				\
        f_start[n].z += f_start[n + t_off].z;				\
@ -372,7 +372,7 @@ inline double MIC_Wtime() {
 									\
  if (evflag) {								\
    if (vflag == 2) {							\
-      const ATOM_T * restrict const xo = x + minlocal;			\
+      const ATOM_T * _noalias const xo = x + minlocal;			\
      _Pragma("vector nontemporal")   					\
      for (int n = iifrom; n < iito; n++) {				\
 	ov0 += f_start[n].x * xo[n].x;					\
--- a/src/USER-INTEL/neigh_half_bin_intel.cpp
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@ -105,7 +105,7 @@ inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2,

 template <class flt_t, class acc_t>
 void Neighbor::bin_atoms(void * xin) {
-  const ATOM_T * restrict const x = (const ATOM_T * restrict const)xin;
+  const ATOM_T * _noalias const x = (const ATOM_T * _noalias const)xin;
  int nlocal = atom->nlocal;
  const int nall = nlocal + atom->nghost;

@ -243,8 +243,8 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
    return;
  }

-  const ATOM_T * restrict const x = buffers->get_x();
-  int * restrict const firstneigh = buffers->firstneigh(list);
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);

  const int molecular = atom->molecular;
  int *ns = NULL, *s = NULL;
@ -260,17 +260,17 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
    tag_size = 0;
    special_size = 0;
  }
-  const int * restrict const special = s;
-  const int * restrict const nspecial = ns;
+  const int * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
  const int maxspecial = atom->maxspecial;
-  const int * restrict const tag = atom->tag;
+  const int * _noalias const tag = atom->tag;

-  int * restrict const ilist = list->ilist;
-  int * restrict numneigh = list->numneigh;
-  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
  const int nstencil = list->nstencil;
-  const int * restrict const stencil = list->stencil;
-  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int * _noalias const stencil = list->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
  const int ntypes = atom->ntypes + 1;
  const int nlocal = atom->nlocal;

@ -316,8 +316,8 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
  }

  #ifdef _LMP_INTEL_OFFLOAD
-  const int * restrict const binhead = this->binhead;
-  const int * restrict const special_flag = this->special_flag;
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const special_flag = this->special_flag;
  const int nbinx = this->nbinx;
  const int nbiny = this->nbiny;
  const int nbinz = this->nbinz;
@ -327,7 +327,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
  const int mbinx = this->mbinx;
  const int mbiny = this->mbiny;
  const int mbinz = this->mbinz;
-  const int * restrict const bins = this->bins;
+  const int * _noalias const bins = this->bins;
  const int cop = fix->coprocessor_number();
  const int separate_buffers = fix->separate_buffers();
  #pragma offload target(mic:cop) if(offload) \
@ -486,7 +486,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,

      if (molecular) {
        for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
          for (int jj = 0; jj < jnum; jj++) {
            const int j = jlist[jj];
@ -507,7 +507,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
      #ifdef _LMP_INTEL_OFFLOAD
      else if (separate_buffers) {
 	for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
 	  int jj = 0;
 	  for (jj = 0; jj < jnum; jj++)
@ -662,8 +662,8 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
    return;
  }

-  const ATOM_T * restrict const x = buffers->get_x();
-  int * restrict const firstneigh = buffers->firstneigh(list);
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
  int nall_t = nall;
  if (offload_noghost && offload) nall_t = atom->nlocal;
  const int e_nall = nall_t;
@ -682,17 +682,17 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
    tag_size = 0;
    special_size = 0;
  }
-  const int * restrict const special = s;
-  const int * restrict const nspecial = ns;
+  const int * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
  const int maxspecial = atom->maxspecial;
-  const int * restrict const tag = atom->tag;
+  const int * _noalias const tag = atom->tag;

-  int * restrict const ilist = list->ilist;
-  int * restrict numneigh = list->numneigh;
-  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
  const int nstencil = list->nstencil;
-  const int * restrict const stencil = list->stencil;
-  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int * _noalias const stencil = list->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
  const int ntypes = atom->ntypes + 1;
  const int nlocal = atom->nlocal;

@ -737,8 +737,8 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
  }

  #ifdef _LMP_INTEL_OFFLOAD
-  const int * restrict const binhead = this->binhead;
-  const int * restrict const special_flag = this->special_flag;
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const special_flag = this->special_flag;
  const int nbinx = this->nbinx;
  const int nbiny = this->nbiny;
  const int nbinz = this->nbinz;
@ -748,7 +748,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
  const int mbinx = this->mbinx;
  const int mbiny = this->mbiny;
  const int mbinz = this->mbinz;
-  const int * restrict const bins = this->bins;
+  const int * _noalias const bins = this->bins;
  const int cop = fix->coprocessor_number();
  const int separate_buffers = fix->separate_buffers();
  #pragma offload target(mic:cop) if(offload) \
@ -948,7 +948,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,

      if (molecular) {
        for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
          for (int jj = 0; jj < jnum; jj++) {
            const int j = jlist[jj];
@ -970,7 +970,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
      #ifdef _LMP_INTEL_OFFLOAD
      else if (separate_buffers) {
 	for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
 	  int jj = 0;
 	  for (jj = 0; jj < jnum; jj++)
@ -1127,8 +1127,8 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
    return;
  }

-  const ATOM_T * restrict const x = buffers->get_x();
-  int * restrict const firstneigh = buffers->firstneigh(list);
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
  int nall_t = nall;
  if (offload_noghost && offload) nall_t = atom->nlocal;
  const int e_nall = nall_t;
@ -1147,17 +1147,17 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
    tag_size = 0;
    special_size = 0;
  }
-  const int * restrict const special = s;
-  const int * restrict const nspecial = ns;
+  const int * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
  const int maxspecial = atom->maxspecial;
-  const int * restrict const tag = atom->tag;
+  const int * _noalias const tag = atom->tag;

-  int * restrict const ilist = list->ilist;
-  int * restrict numneigh = list->numneigh;
-  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
  const int nstencil = list->nstencil;
-  const int * restrict const stencil = list->stencil;
-  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int * _noalias const stencil = list->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
  const int ntypes = atom->ntypes + 1;
  const int nlocal = atom->nlocal;

@ -1202,8 +1202,8 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
  }

  #ifdef _LMP_INTEL_OFFLOAD
-  const int * restrict const binhead = this->binhead;
-  const int * restrict const special_flag = this->special_flag;
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const special_flag = this->special_flag;
  const int nbinx = this->nbinx;
  const int nbiny = this->nbiny;
  const int nbinz = this->nbinz;
@ -1213,7 +1213,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
  const int mbinx = this->mbinx;
  const int mbiny = this->mbiny;
  const int mbinz = this->mbinz;
-  const int * restrict const bins = this->bins;
+  const int * _noalias const bins = this->bins;
  const int cop = fix->coprocessor_number();
  const int separate_buffers = fix->separate_buffers();
  #pragma offload target(mic:cop) if(offload) \
@ -1386,7 +1386,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,

      if (molecular) {
        for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
          for (int jj = 0; jj < jnum; jj++) {
            const int j = jlist[jj];
@ -1407,7 +1407,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
      #ifdef _LMP_INTEL_OFFLOAD
      else if (separate_buffers) {
 	for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
 	  int jj = 0;
 	  for (jj = 0; jj < jnum; jj++)
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@ -79,7 +79,7 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
    fix->start_watch(TIME_PACK);
    const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
    const int * const ellipsoid = atom->ellipsoid;
-    QUAT_T * restrict const quat = buffers->get_quat();
+    QUAT_T * _noalias const quat = buffers->get_quat();
    #if defined(_OPENMP)
    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
    #endif
@ -150,8 +150,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
  fix->get_buffern(offload, nlocal, nall, minlocal);

  const int ago = neighbor->ago;
-  ATOM_T * restrict const x = buffers->get_x(offload);
-  QUAT_T * restrict const quat = buffers->get_quat(offload);
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  QUAT_T * _noalias const quat = buffers->get_quat(offload);
  const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
  const int *ellipsoid = atom->ellipsoid;

@ -225,15 +225,15 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
  }									
  #endif

-  //  const int * restrict const ilist = list->ilist;
-  const int * restrict const numneigh = list->numneigh;
-  const int * restrict const cnumneigh = buffers->cnumneigh(list);
-  const int * restrict const firstneigh = buffers->firstneigh(list);
-  const flt_t * restrict const special_lj = fc.special_lj;
+  //  const int * _noalias const ilist = list->ilist;
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const flt_t * _noalias const special_lj = fc.special_lj;

-  const FC_PACKED1_T * restrict const ijc = fc.ijc[0];
-  const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
-  const FC_PACKED3_T * restrict const ic = fc.ic;
+  const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
+  const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
+  const FC_PACKED3_T * _noalias const ic = fc.ic;
  const flt_t mu = fc.mu;
  const flt_t gamma = fc.gamma;
  const flt_t upsilon = fc.upsilon;
@ -255,8 +255,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 		       x_size, q_size, ev_size, f_stride);

  int tc;
-  FORCE_T * restrict f_start;
-  acc_t * restrict ev_global;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
  const int max_nbors = _max_nbors;
  const int nthreads = tc;
@ -351,25 +351,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
      iifrom += astart;
      iito += astart;

-      FORCE_T * restrict const f = f_start - minlocal * 2 + (tid * f_stride);
+      FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
      memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));

-      flt_t * restrict const rsq_form = rsq_formi + tid * max_nbors;
-      flt_t * restrict const delx_form = delx_formi + tid * max_nbors;
-      flt_t * restrict const dely_form = dely_formi + tid * max_nbors;
-      flt_t * restrict const delz_form = delz_formi + tid * max_nbors;
-      int * restrict const jtype_form = jtype_formi + tid * max_nbors;
-      int * restrict const jlist_form = jlist_formi + tid * max_nbors;
+      flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
+      flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
+      flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
+      flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
+      int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
+      int * _noalias const jlist_form = jlist_formi + tid * max_nbors;

      int ierror = 0;
      for (int i = iifrom; i < iito; ++i) {
        // const int i = ilist[ii];
        const int itype = x[i].w;
        const int ptr_off = itype * ntypes;
-        const FC_PACKED1_T * restrict const ijci = ijc + ptr_off;
-        const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
+        const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
+        const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;

-        const int * restrict const jlist = firstneigh + cnumneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
        const int jnum = numneigh[i];

        const flt_t xtmp = x[i].x;
@ -819,7 +819,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,

      if (EVFLAG) {
        if (vflag==2) {
-          const ATOM_T * restrict const xo = x + minlocal;
+          const ATOM_T * _noalias const xo = x + minlocal;
 	  #pragma vector nontemporal
          for (int n = iifrom; n < iito; n++) {
            const int nt2 = n * 2;
--- a/src/USER-INTEL/pair_gayberne_intel.h
+++ b/src/USER-INTEL/pair_gayberne_intel.h
@ -62,7 +62,10 @@ class PairGayBerneIntel : public PairGayBerne {
    typedef struct { flt_t lj3, lj4; } fc_packed2;
    typedef struct { flt_t shape2[4], well[4]; } fc_packed3;

-    __declspec(align(64)) flt_t special_lj[4], gamma, upsilon, mu;
+    _alignvar(flt_t special_lj[4],64);
+    _alignvar(flt_t gamma,64);
+    _alignvar(flt_t upsilon,64);
+    _alignvar(flt_t mu,64);
    fc_packed1 **ijc;
    fc_packed2 **lj34;
    fc_packed3 *ic;
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@ -143,25 +143,25 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
  const int ago = neighbor->ago;
  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);

-  ATOM_T * restrict const x = buffers->get_x(offload);
-  flt_t * restrict const q = buffers->get_q(offload);
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  flt_t * _noalias const q = buffers->get_q(offload);

-  const int * restrict const numneigh = list->numneigh;
-  const int * restrict const cnumneigh = buffers->cnumneigh(list);
-  const int * restrict const firstneigh = buffers->firstneigh(list);
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);

-  const flt_t * restrict const special_coul = fc.special_coul;
-  const flt_t * restrict const special_lj = fc.special_lj;
+  const flt_t * _noalias const special_coul = fc.special_coul;
+  const flt_t * _noalias const special_lj = fc.special_lj;
  const flt_t qqrd2e = force->qqrd2e;
  const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;

-  const flt_t * restrict const cutsq = fc.cutsq[0];
-  const LJ_T * restrict const lj = fc.lj[0];
-  const TABLE_T * restrict const table = fc.table;
-  const flt_t * restrict const etable = fc.etable;
-  const flt_t * restrict const detable = fc.detable;
-  const flt_t * restrict const ctable = fc.ctable;
-  const flt_t * restrict const dctable = fc.dctable;
+  const flt_t * _noalias const cutsq = fc.cutsq[0];
+  const LJ_T * _noalias const lj = fc.lj[0];
+  const TABLE_T * _noalias const table = fc.table;
+  const flt_t * _noalias const etable = fc.etable;
+  const flt_t * _noalias const detable = fc.detable;
+  const flt_t * _noalias const ctable = fc.ctable;
+  const flt_t * _noalias const dctable = fc.dctable;
  const flt_t cut_ljsq = fc.cut_ljsq;
  const flt_t cut_lj_innersq = fc.cut_lj_innersq;
  const flt_t cut_coulsq = fc.cut_coulsq;
@ -178,8 +178,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 		       x_size, q_size, ev_size, f_stride);
 		       
  int tc;
-  FORCE_T * restrict f_start;
-  acc_t * restrict ev_global;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);

  const int nthreads = tc;
@ -242,7 +242,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
      iifrom += astart;
      iito += astart;

-      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
      flt_t cutboth = cut_coulsq;

@ -251,10 +251,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
        const int itype = x[i].w;

        const int ptr_off = itype * ntypes;
-        const flt_t * restrict const cutsqi = cutsq + ptr_off;
-        const LJ_T * restrict const lji = lj + ptr_off;
+        const flt_t * _noalias const cutsqi = cutsq + ptr_off;
+        const LJ_T * _noalias const lji = lj + ptr_off;

-        const int   * restrict const jlist = firstneigh + cnumneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
        const int jnum = numneigh[i];

        acc_t fxtmp,fytmp,fztmp,fwtmp;
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
@ -62,8 +62,8 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
  class ForceConst {
   public:
    typedef struct { flt_t r, dr, f, df; } table_t;
-    __declspec(align(64)) flt_t special_coul[4];
-    __declspec(align(64)) flt_t special_lj[4];
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
    flt_t **cutsq, g_ewald, tabinnersq;
    flt_t cut_coulsq, cut_ljsq;
    flt_t cut_lj_innersq;
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@ -143,24 +143,24 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
  const int ago = neighbor->ago;
  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);

-  ATOM_T * restrict const x = buffers->get_x(offload);
-  flt_t * restrict const q = buffers->get_q(offload);
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  flt_t * _noalias const q = buffers->get_q(offload);

-  const int * restrict const numneigh = list->numneigh;
-  const int * restrict const cnumneigh = buffers->cnumneigh(list);
-  const int * restrict const firstneigh = buffers->firstneigh(list);
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);

-  const flt_t * restrict const special_coul = fc.special_coul;
-  const flt_t * restrict const special_lj = fc.special_lj;
+  const flt_t * _noalias const special_coul = fc.special_coul;
+  const flt_t * _noalias const special_lj = fc.special_lj;
  const flt_t qqrd2e = force->qqrd2e;

-  const C_FORCE_T * restrict const c_force = fc.c_force[0];
-  const C_ENERGY_T * restrict const c_energy = fc.c_energy[0];
-  const TABLE_T * restrict const table = fc.table;
-  const flt_t * restrict const etable = fc.etable;
-  const flt_t * restrict const detable = fc.detable;
-  const flt_t * restrict const ctable = fc.ctable;
-  const flt_t * restrict const dctable = fc.dctable;
+  const C_FORCE_T * _noalias const c_force = fc.c_force[0];
+  const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
+  const TABLE_T * _noalias const table = fc.table;
+  const flt_t * _noalias const etable = fc.etable;
+  const flt_t * _noalias const detable = fc.detable;
+  const flt_t * _noalias const ctable = fc.ctable;
+  const flt_t * _noalias const dctable = fc.dctable;
  const flt_t g_ewald = fc.g_ewald;
  const flt_t tabinnersq = fc.tabinnersq;

@ -174,8 +174,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 		       x_size, q_size, ev_size, f_stride);

  int tc;
-  FORCE_T * restrict f_start;
-  acc_t * restrict ev_global;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);

  const int nthreads = tc;
@ -237,17 +237,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
      iifrom += astart;
      iito += astart;

-      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

      for (int i = iifrom; i < iito; ++i) {
        const int itype = x[i].w;

        const int ptr_off = itype * ntypes;
-        const C_FORCE_T * restrict const c_forcei = c_force + ptr_off;
-        const C_ENERGY_T * restrict const c_energyi = c_energy + ptr_off;
+        const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
+        const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;

-        const int   * restrict const jlist = firstneigh + cnumneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
        const int jnum = numneigh[i];

        acc_t fxtmp,fytmp,fztmp,fwtmp;
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
@ -64,8 +64,8 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
    typedef struct { flt_t cutsq, cut_ljsq, lj1, lj2; } c_force_t;
    typedef struct { flt_t lj3, lj4, offset, pad; } c_energy_t;
    typedef struct { flt_t r, dr, f, df; } table_t;
-    __declspec(align(64)) flt_t special_coul[4];
-    __declspec(align(64)) flt_t special_lj[4];
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
    flt_t g_ewald, tabinnersq;
    c_force_t **c_force;
    c_energy_t **c_energy;
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@ -134,14 +134,14 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
  const int ago = neighbor->ago;
  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);

-  ATOM_T * restrict const x = buffers->get_x(offload);
+  ATOM_T * _noalias const x = buffers->get_x(offload);

-  const int * restrict const numneigh = list->numneigh;
-  const int * restrict const cnumneigh = buffers->cnumneigh(list);
-  const int * restrict const firstneigh = buffers->firstneigh(list);
-  const flt_t * restrict const special_lj = fc.special_lj;
-  const FC_PACKED1_T * restrict const ljc12o = fc.ljc12o[0];
-  const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const flt_t * _noalias const special_lj = fc.special_lj;
+  const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
+  const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];

  const int ntypes = atom->ntypes + 1;
  const int eatom = this->eflag_atom;
@ -153,8 +153,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 		       x_size, q_size, ev_size, f_stride);

  int tc;
-  FORCE_T * restrict f_start;
-  acc_t * restrict ev_global;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
  const int nthreads = tc;
  int *overflow = fix->get_off_overflow_flag();
@ -184,17 +184,17 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
      iifrom += astart;
      iito += astart;

-      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

      for (int i = iifrom; i < iito; ++i) {
        const int itype = x[i].w;

        const int ptr_off = itype * ntypes;
-        const FC_PACKED1_T * restrict const ljc12oi = ljc12o + ptr_off;
-        const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
+        const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
+        const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;

-        const int * restrict const jlist = firstneigh + cnumneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
        const int jnum = numneigh[i];

        acc_t fxtmp, fytmp, fztmp, fwtmp;
--- a/src/USER-INTEL/pair_lj_cut_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_intel.h
@ -62,7 +62,7 @@ class PairLJCutIntel : public PairLJCut {
    typedef struct { flt_t cutsq, lj1, lj2, offset; } fc_packed1;
    typedef struct { flt_t lj3, lj4; } fc_packed2;

-    __declspec(align(64)) flt_t special_lj[4];
+    _alignvar(flt_t special_lj[4],64);
    fc_packed1 **ljc12o;
    fc_packed2 **lj34;

--- a/src/USER-INTEL/verlet_intel.cpp
+++ b/src/USER-INTEL/verlet_intel.cpp
@ -17,6 +17,7 @@
 #include "domain.h"
 #include "comm.h"
 #include "atom.h"
+#include "atom_vec.h"
 #include "force.h"
 #include "pair.h"
 #include "bond.h"
@ -81,14 +82,9 @@ void VerletIntel::init()
  // set flags for what arrays to clear in force_clear()
  // need to clear additionals arrays if they exist

-  torqueflag = 0;
+  torqueflag = extraflag = 0;
  if (atom->torque_flag) torqueflag = 1;
-  erforceflag = 0;
-  if (atom->erforce_flag) erforceflag = 1;
-  e_flag = 0;
-  if (atom->e_flag) e_flag = 1;
-  rho_flag = 0;
-  if (atom->rho_flag) rho_flag = 1;
+  if (atom->avec->forceclearflag) extraflag = 1;

  // orthogonal vs triclinic simulation box

@ -276,8 +272,10 @@ void VerletIntel::run(int n)

    // initial time integration

+    timer->stamp();
    modify->initial_integrate(vflag);
    if (n_post_integrate) modify->post_integrate();
+    timer->stamp(Timer::MODIFY);

    // regular communication vs neighbor list rebuild

@ -286,9 +284,13 @@ void VerletIntel::run(int n)
    if (nflag == 0) {
      timer->stamp();
      comm->forward_comm();
-      timer->stamp(TIME_COMM);
+      timer->stamp(Timer::COMM);
    } else {
-      if (n_pre_exchange) modify->pre_exchange();
+      if (n_pre_exchange) {
+        timer->stamp();
+        modify->pre_exchange();
+        timer->stamp(Timer::MODIFY);
+      }
      if (triclinic) domain->x2lamda(atom->nlocal);
      domain->pbc();
      if (domain->box_change) {
@ -301,10 +303,13 @@ void VerletIntel::run(int n)
      if (sortflag && ntimestep >= atom->nextsort) atom->sort();
      comm->borders();
      if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
-      timer->stamp(TIME_COMM);
-      if (n_pre_neighbor) modify->pre_neighbor();
+      timer->stamp(Timer::COMM);
+      if (n_pre_neighbor) {
+        modify->pre_neighbor();
+        timer->stamp(Timer::MODIFY);
+      }
      neighbor->build();
-      timer->stamp(TIME_NEIGHBOR);
+      timer->stamp(Timer::NEIGH);
    }

    // force computations
@ -313,13 +318,18 @@ void VerletIntel::run(int n)
    // and Pair:ev_tally() needs to be called before any tallying

    force_clear();
-    if (n_pre_force) modify->pre_force(vflag);

    timer->stamp();

+    if (n_pre_force) {
+      modify->pre_force(vflag);
+      timer->stamp(Timer::MODIFY);
+    }
+
+
    if (pair_compute_flag) {
      force->pair->compute(eflag,vflag);
-      timer->stamp(TIME_PAIR);
+      timer->stamp(Timer::PAIR);
    }

    if (atom->molecular) {
@ -327,18 +337,18 @@ void VerletIntel::run(int n)
      if (force->angle) force->angle->compute(eflag,vflag);
      if (force->dihedral) force->dihedral->compute(eflag,vflag);
      if (force->improper) force->improper->compute(eflag,vflag);
-      timer->stamp(TIME_BOND);
+      timer->stamp(Timer::BOND);
    }

    if (kspace_compute_flag) {
      force->kspace->compute(eflag,vflag);
-      timer->stamp(TIME_KSPACE);
+      timer->stamp(Timer::KSPACE);
    }

    #ifdef _LMP_INTEL_OFFLOAD
    if (sync_mode == 1) {
      fix_intel->sync_coprocessor();
-      timer->stamp(TIME_PAIR);
+      timer->stamp(Timer::PAIR);
    }
    #endif

@ -346,13 +356,13 @@ void VerletIntel::run(int n)

    if (force->newton) {
      comm->reverse_comm();
-      timer->stamp(TIME_COMM);
+      timer->stamp(Timer::COMM);
    }

    #ifdef _LMP_INTEL_OFFLOAD
    if (sync_mode == 2) {
      fix_intel->sync_coprocessor();
-      timer->stamp(TIME_PAIR);
+      timer->stamp(Timer::PAIR);
    }
    #endif

@ -361,13 +371,14 @@ void VerletIntel::run(int n)
    if (n_post_force) modify->post_force(vflag);
    modify->final_integrate();
    if (n_end_of_step) modify->end_of_step();
+    timer->stamp(Timer::MODIFY);

    // all output

    if (ntimestep == output->next) {
      timer->stamp();
      output->write(ntimestep);
-      timer->stamp(TIME_OUTPUT);
+      timer->stamp(Timer::OUTPUT);
    }
  }
 }
@ -388,7 +399,7 @@ void VerletIntel::cleanup()

 void VerletIntel::force_clear()
 {
-  int i;
+  size_t nbytes;

  if (external_force_clear) return;

@ -396,19 +407,16 @@ void VerletIntel::force_clear()
  // if either newton flag is set, also include ghosts
  // when using threads always clear all forces.

-  if (neighbor->includegroup == 0) {
-    int nall;
-    if (force->newton) nall = atom->nlocal + atom->nghost;
-    else nall = atom->nlocal;
+  int nlocal = atom->nlocal;

-    size_t nbytes = sizeof(double) * nall;
+  if (neighbor->includegroup == 0) {
+    nbytes = sizeof(double) * nlocal;
+    if (force->newton) nbytes += sizeof(double) * atom->nghost;

    if (nbytes) {
-      memset(&(atom->f[0][0]),0,3*nbytes);
-      if (torqueflag)  memset(&(atom->torque[0][0]),0,3*nbytes);
-      if (erforceflag) memset(&(atom->erforce[0]),  0,  nbytes);
-      if (e_flag)      memset(&(atom->de[0]),       0,  nbytes);
-      if (rho_flag)    memset(&(atom->drho[0]),     0,  nbytes);
+      memset(&atom->f[0][0],0,3*nbytes);
+      if (torqueflag) memset(&atom->torque[0][0],0,3*nbytes);
+      if (extraflag) atom->avec->force_clear(0,nbytes);
    }

  // neighbor includegroup flag is set
@ -416,70 +424,21 @@ void VerletIntel::force_clear()
  // if either newton flag is set, also include ghosts

  } else {
-    int nall = atom->nfirst;
+    nbytes = sizeof(double) * atom->nfirst;

-    double **f = atom->f;
-    for (i = 0; i < nall; i++) {
-      f[i][0] = 0.0;
-      f[i][1] = 0.0;
-      f[i][2] = 0.0;
-    }
-
-    if (torqueflag) {
-      double **torque = atom->torque;
-      for (i = 0; i < nall; i++) {
-        torque[i][0] = 0.0;
-        torque[i][1] = 0.0;
-        torque[i][2] = 0.0;
-      }
-    }
-
-    if (erforceflag) {
-      double *erforce = atom->erforce;
-      for (i = 0; i < nall; i++) erforce[i] = 0.0;
-    }
-
-    if (e_flag) {
-      double *de = atom->de;
-      for (i = 0; i < nall; i++) de[i] = 0.0;
-    }
-
-    if (rho_flag) {
-      double *drho = atom->drho;
-      for (i = 0; i < nall; i++) drho[i] = 0.0;
+    if (nbytes) {
+      memset(&atom->f[0][0],0,3*nbytes);
+      if (torqueflag) memset(&atom->torque[0][0],0,3*nbytes);
+      if (extraflag) atom->avec->force_clear(0,nbytes);
    }

    if (force->newton) {
-      nall = atom->nlocal + atom->nghost;
+      nbytes = sizeof(double) * atom->nghost;

-      for (i = atom->nlocal; i < nall; i++) {
-        f[i][0] = 0.0;
-        f[i][1] = 0.0;
-        f[i][2] = 0.0;
-      }
-
-      if (torqueflag) {
-        double **torque = atom->torque;
-        for (i = atom->nlocal; i < nall; i++) {
-          torque[i][0] = 0.0;
-          torque[i][1] = 0.0;
-          torque[i][2] = 0.0;
-        }
-      }
-
-      if (erforceflag) {
-        double *erforce = atom->erforce;
-        for (i = atom->nlocal; i < nall; i++) erforce[i] = 0.0;
-      }
-
-      if (e_flag) {
-        double *de = atom->de;
-        for (i = 0; i < nall; i++) de[i] = 0.0;
-      }
-
-      if (rho_flag) {
-        double *drho = atom->drho;
-        for (i = 0; i < nall; i++) drho[i] = 0.0;
+      if (nbytes) {
+        memset(&atom->f[nlocal][0],0,3*nbytes);
+        if (torqueflag) memset(&atom->torque[nlocal][0],0,3*nbytes);
+        if (extraflag) atom->avec->force_clear(nlocal,nbytes);
      }
    }
  }
--- a/src/USER-INTEL/verlet_intel.h
+++ b/src/USER-INTEL/verlet_intel.h
@ -39,8 +39,7 @@ class VerletIntel : public Integrate {

 protected:
  int triclinic;                    // 0 if domain is orthog, 1 if triclinic
-  int torqueflag,erforceflag;
-  int e_flag,rho_flag;
+  int torqueflag,extraflag;

  virtual void force_clear();
  #ifdef _LMP_INTEL_OFFLOAD
--- a/src/USER-INTEL/verlet_split_intel.cpp
+++ b/src/USER-INTEL/verlet_split_intel.cpp
@ -52,6 +52,9 @@ VerletSplitIntel::VerletSplitIntel(LAMMPS *lmp, int narg, char **arg) :
  if (universe->procs_per_world[0] % universe->procs_per_world[1])
    error->universe_all(FLERR,"Verlet/split requires Rspace partition "
                        "size be multiple of Kspace partition size");
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Verlet/split can only currently be used with "
+                        "comm_style brick");

  // master = 1 for Rspace procs, 0 for Kspace procs

@ -214,6 +217,9 @@ VerletSplitIntel::~VerletSplitIntel()

 void VerletSplitIntel::init()
 {
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Verlet/split can only currently be used with "
+                        "comm_style brick");
  if (!force->kspace && comm->me == 0)
    error->warning(FLERR,"No Kspace calculation with verlet/split");

@ -277,7 +283,7 @@ void VerletSplitIntel::run(int n)

  MPI_Barrier(universe->uworld);
  timer->init();
-  timer->barrier_start(TIME_LOOP);
+  timer->barrier_start();

  // setup initial Rspace <-> Kspace comm params

@ -323,7 +329,7 @@ void VerletSplitIntel::run(int n)
      if (nflag == 0) {
        timer->stamp();
        comm->forward_comm();
-        timer->stamp(TIME_COMM);
+        timer->stamp(Timer::COMM);
      } else {
        if (n_pre_exchange) modify->pre_exchange();
        if (triclinic) domain->x2lamda(atom->nlocal);
@ -338,10 +344,10 @@ void VerletSplitIntel::run(int n)
        if (sortflag && ntimestep >= atom->nextsort) atom->sort();
        comm->borders();
        if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
-        timer->stamp(TIME_COMM);
+        timer->stamp(Timer::COMM);
        if (n_pre_neighbor) modify->pre_neighbor();
        neighbor->build();
-        timer->stamp(TIME_NEIGHBOR);
+        timer->stamp(Timer::NEIGH);
      }
    }

@ -361,7 +367,7 @@ void VerletSplitIntel::run(int n)
      timer->stamp();
      if (force->pair) {
        force->pair->compute(eflag,vflag);
-        timer->stamp(TIME_PAIR);
+        timer->stamp(Timer::PAIR);
      }

      if (atom->molecular) {
@ -369,25 +375,25 @@ void VerletSplitIntel::run(int n)
        if (force->angle) force->angle->compute(eflag,vflag);
        if (force->dihedral) force->dihedral->compute(eflag,vflag);
        if (force->improper) force->improper->compute(eflag,vflag);
-        timer->stamp(TIME_BOND);
+        timer->stamp(Timer::BOND);
      }

      #ifdef _LMP_INTEL_OFFLOAD
      if (sync_mode == 1) {
 	fix_intel->sync_coprocessor();
-	timer->stamp(TIME_PAIR);
+	timer->stamp(Timer::PAIR);
      }
      #endif

      if (force->newton) {
        comm->reverse_comm();
-        timer->stamp(TIME_COMM);
+        timer->stamp(Timer::COMM);
      }

      #ifdef _LMP_INTEL_OFFLOAD
      if (sync_mode == 2) {
 	fix_intel->sync_coprocessor();
-	timer->stamp(TIME_PAIR);
+	timer->stamp(Timer::PAIR);
      }
      #endif

@ -400,14 +406,14 @@ void VerletSplitIntel::run(int n)
      if (force->kspace) {
        timer->stamp();
        force->kspace->compute(eflag,vflag);
-        timer->stamp(TIME_KSPACE);
+        timer->stamp(Timer::KSPACE);
      }

      // TIP4P PPPM puts forces on ghost atoms, so must reverse_comm()

      if (tip4p_flag && force->newton) {
        comm->reverse_comm();
-        timer->stamp(TIME_COMM);
+        timer->stamp(Timer::COMM);
      }
    }

@ -419,14 +425,16 @@ void VerletSplitIntel::run(int n)
    // all output

    if (master) {
+      timer->stamp();
      if (n_post_force) modify->post_force(vflag);
      modify->final_integrate();
      if (n_end_of_step) modify->end_of_step();
+      timer->stamp(Timer::MODIFY);

      if (ntimestep == output->next) {
        timer->stamp();
        output->write(ntimestep);
-        timer->stamp(TIME_OUTPUT);
+        timer->stamp(Timer::OUTPUT);
      }
    }
  }
@ -498,7 +506,7 @@ void VerletSplitIntel::rk_setup()
      atom->map_clear();
      comm->borders();
      if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
-      timer->stamp(TIME_COMM);
+      timer->stamp(Timer::COMM);
    }
  }
 }
@ -549,7 +557,7 @@ void VerletSplitIntel::r2k_comm()
  if (tip4p_flag && !master) {
    timer->stamp();
    comm->forward_comm();
-    timer->stamp(TIME_COMM);
+    timer->stamp(Timer::COMM);
  }
 }

--- a/src/input.cpp
+++ b/src/input.cpp
@ -1442,30 +1442,6 @@ void Input::package()
    delete [] fixarg;

 } else if (strcmp(arg[0],"intel") == 0) {
-
-    // add omp package for non-pair routines
-
-    /*
-    char **fixarg = new char*[2+narg];
-    fixarg[0] = (char *) "package_omp";
-    fixarg[1] = (char *) "all";
-    fixarg[2] = (char *) "OMP";
-    int omp_narg = 3;
-    if (narg > 1) {
-      fixarg[3] = arg[1];
-      omp_narg++;
-      if (narg > 2)
-	for (int i = 2; i < narg; i++)
-	  if (strcmp(arg[i],"mixed") == 0) {
-	    fixarg[4] = arg[i];
-	    omp_narg++;
-	  }
-    }
-    modify->add_fix(omp_narg,fixarg);
-
-    // add intel package for neighbor and pair routines
-    */
-
    if (!modify->check_package("Intel"))
      error->all(FLERR,
                 "Package intel command without USER-INTEL package installed");
@ -1478,17 +1454,12 @@ void Input::package()
    modify->add_fix(2+narg,fixarg);
    delete [] fixarg;

-    /*
-    // if running with offload, set run_style to verlet/intel
+    // set integrator = verlet/intel
+    // -sf intel does same thing in Update constructor via suffix

-    #ifdef LMP_INTEL_OFFLOAD
-    #ifdef __INTEL_OFFLOAD
    char *str;
    str = (char *) "verlet/intel";
    update->create_integrate(1,&str,0);
-    #endif
-    #endif
-    */

  } else error->all(FLERR,"Illegal package command");
 }
--- a/src/integrate.cpp
+++ b/src/integrate.cpp
@ -53,6 +53,15 @@ void Integrate::init()
  else pair_compute_flag = 0;
  if (force->kspace && force->kspace->compute_flag) kspace_compute_flag = 1;
  else kspace_compute_flag = 0;
+
+  // should add checks:
+  // for any acceleration package that has its own integrate/minimize
+  // in case input script has reset the run or minimize style explicitly
+  // e.g. invalid to have intel pair style with non-intel verlet
+  // but OK to have intel verlet with non intel pair style (just warn)
+  // ditto for USER-CUDA and KOKKOS package verlet with their pair, fix, etc
+  // making these checks would require all the pair, fix, etc styles have
+  //   cuda, kokkos, intel flags
 }

 /* ----------------------------------------------------------------------
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@ -667,6 +667,8 @@ void LAMMPS::post_create()
 {
  if (!suffix_enable) return;

+  // suffix will always be set if suffix_enable = 1
+
  if (strcmp(suffix,"gpu") == 0 && !modify->check_package("GPU"))
    error->all(FLERR,"Using suffix gpu without GPU package installed");
  if (strcmp(suffix,"intel") == 0 && !modify->check_package("Intel"))
@ -674,7 +676,10 @@ void LAMMPS::post_create()
  if (strcmp(suffix,"omp") == 0 && !modify->check_package("OMP"))
    error->all(FLERR,"Using suffix omp without USER-OMP package installed");

-  if (strcmp(suffix2,"omp") == 0 && !modify->check_package("OMP")) {
+  // suffix2 only currently set by -sf intel
+  // need to unset if LAMMPS was not built with USER-OMP package
+
+  if (suffix2 && strcmp(suffix2,"omp") == 0 && !modify->check_package("OMP")) {
    delete [] suffix2;
    suffix2 = NULL;
  }
--- a/src/lammps.h
+++ b/src/lammps.h
@ -47,9 +47,6 @@ class LAMMPS {
  int cite_enable;               // 1 if generating log.cite, 0 if disabled

  class Cuda *cuda;              // CUDA accelerator class
-  //class GPU *gpu;                // GPU accelerator class
-  //class Intel *intel;            // Intel accelerator class
-  //class OMP *omp;                // OMP accelerator class
  class KokkosLMP *kokkos;       // KOKKOS accelerator class

  class CiteMe *citeme;          // citation info
--- a/src/lmptype.h
+++ b/src/lmptype.h
@ -167,6 +167,33 @@ typedef int bigint;

 }

+// preprocessor macros for compiler specific settings
+// clear previous definitions to avoid redefinition warning
+#ifdef _alignvar
+#undef _alignvar
+#endif
+#ifdef _noalias
+#undef _noalias
+#endif
+
+// define stack variable alignment
+#if defined(__INTEL_COMPILER)
+#define _alignvar(expr,val) __declspec(align(val)) expr
+#elif defined(__GNUC__)
+#define _alignvar(expr,val) expr __attribute((aligned(val)))
+#else
+#define _alignvar(expr,val) expr
+#endif
+
+// declaration to lift aliasing restrictions
+#if defined(__INTEL_COMPILER)
+#define _noalias restrict
+#elif defined(__GNUC__)
+#define _noalias __restrict
+#else
+#define _noalias
+#endif
+
 // settings to enable LAMMPS to build under Windows

 #ifdef _WIN32
--- a/src/version.h
+++ b/src/version.h
@ -1 +1 @@
-#define LAMMPS_VERSION "14 Aug 2014"
+#define LAMMPS_VERSION "15 Aug 2014"