Merge branch 'develop' into pair_style_tutorial

2023-03-14 10:10:52 -04:00
parent 8353c8336c e4ad330c2b
commit 5dc9a570fa
201 changed files with 4292 additions and 3247 deletions
--- a/doc/src/bond_fene.rst
+++ b/doc/src/bond_fene.rst
@ -51,7 +51,7 @@ in the same form as in pair style :doc:`nm/cut <pair_nm>`. The bond energy is th

 .. math::

-  E = -0.5 K r_0^2  \ln \left[ 1 - \left(\frac{r}{R_0}\right)^2\right] + \frac{E_0}{(n-m)} \left[ m \left(\frac{r_0}{r}\right)^n - n \left(\frac{r_0}{r}\right)^m \right]
+  E = -0.5 K R_0^2  \ln \left[ 1 - \left(\frac{r}{R_0}\right)^2\right] + \frac{E_0}{(n-m)} \left[ m \left(\frac{r_0}{r}\right)^n - n \left(\frac{r_0}{r}\right)^m \right]

 Similar to the *fene* style, the generalized Lennard-Jones is cut off at
 the potential minimum, :math:`r_0`, to be repulsive only.  The following
--- a/doc/src/fix_polarize.rst
+++ b/doc/src/fix_polarize.rst
@ -31,7 +31,7 @@ Examples

   fix 2 interface polarize/bem/gmres 5 0.0001
   fix 1 interface polarize/bem/icc 1 0.0001
-   fix 3 interface polarize/functional 1 0.001
+   fix 3 interface polarize/functional 1 0.0001


 Used in input scripts:
@ -69,8 +69,9 @@ along the normal vector is then 78 - 4 = 74, the mean dielectric value
 is (78 + 4) / 2 = 41. Each boundary element also has its area and the
 local mean curvature, which is used by these fixes for computing a
 correction term in the local electric field.  To model charged
-interfaces, the interface particle will have a non-zero charge value,
-coming from its area and surface charge density.
+interfaces, an interface particle will have a non-zero charge value,
+coming from its area and surface charge density, and its local dielectric
+constant set to the mean dielectric value.

 For non-interface particles such as atoms and charged particles, the
 interface normal vectors, element area, and dielectric mismatch are
@ -211,6 +212,8 @@ Note that the *polarize/bem/gmres* and *polarize/bem/icc* fixes only
 support :doc:`units <units>` *lj*, *real*, *metal*, *si* and *nano* at
 the moment.

+Note that *polarize/functional* does not yet support charged interfaces.
+

 Related commands
 """"""""""""""""
@ -223,7 +226,7 @@ Related commands
 Default
 """""""

-*iter_max* = 20
+*iter_max* = 50

 *kspace* = yes

--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@ -319,7 +319,7 @@ CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
 THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
 BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
 BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
-PPPM_MAX_SPLINE.
+PPPM_MAX_SPLINE, NBOR_PREFETCH.

 CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
 (NVIDIA) or OpenCL extensions (Intel) should be used for horizontal
--- a/examples/PACKAGES/dielectric/data.sphere
+++ b/examples/PACKAGES/dielectric/data.sphere
--- a/examples/PACKAGES/dielectric/in.confined
+++ b/examples/PACKAGES/dielectric/in.confined
@ -7,7 +7,7 @@
 # Dielectric constants can be set to be different from the input data file

 variable epsilon1 index 20
-variable epsilon2 index 10 
+variable epsilon2 index 10

 variable    data index data.confined

@ -53,11 +53,11 @@ kspace_modify   slab 3.0
 neigh_modify    every 1 delay 0 check yes one 5000

 #compute         ef all efield/atom
-dump            1 all custom 100 all.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
-dump            2 interface custom 100 interface.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
-dump_modify     1 sort id
+#dump            1 all custom 100 all.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump            2 interface custom 100 interface.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id

-dump            3 ions custom 100 ions.dump id mol type q x y z fx fy fz #c_ef[1] c_ef[2] c_ef[3]
+#dump            3 ions custom 100 ions.dump id mol type q x y z fx fy fz #c_ef[1] c_ef[2] c_ef[3]

 fix             1 ions nve

@ -67,10 +67,10 @@ if "${method} == gmres" then &
  "fix             3 interface polarize/bem/gmres 1 1.0e-4" &
  "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" &
 elif "${method} == icc"&
-  "fix             3 interface polarize/bem/icc 1 1.0e-4 itr_max 50" &
+  "fix             3 interface polarize/bem/icc 1 1.0e-4" &
  "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" &
 elif "${method} == dof" &
-  "fix             3 interface polarize/functional 1 0.001" &
+  "fix             3 interface polarize/functional 1 0.0001" &
  "fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" &
 else &
  "print 'Unsupported polarization solver' "
--- a/examples/PACKAGES/dielectric/in.nopbc
+++ b/examples/PACKAGES/dielectric/in.nopbc
@ -25,10 +25,10 @@ pair_coeff      1 1 0.0 1.0
 neigh_modify    one 5000

 #compute         ef all efield/atom
-dump            1 all custom 100 all.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
-dump            2 interface custom 100 interface.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
-
-dump_modify     1 sort id
+#dump            1 all custom 100 all.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#
+#dump            2 interface custom 100 interface.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id

 fix             1 ions nve

--- a/examples/PACKAGES/dielectric/log.07Mar23.confined.dof.g++.1
+++ b/examples/PACKAGES/dielectric/log.07Mar23.confined.dof.g++.1
@ -0,0 +1,192 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Two ions, a cation and an anion, confined between two interfaces: epsilon1 | epsilon2  | epsilon1
+# The interface normal vectors should be consistent with ed, pointing from region with epsilon1 to that with epsilon2
+# bottom interface: n = (0, 0, 1)
+# top interface:    n = (0, 0, -1)
+# so that ed's are the same for both interfaces
+
+# Dielectric constants can be set to be different from the input data file
+
+variable epsilon1 index 20
+variable epsilon2 index 10
+
+variable    data index data.confined
+
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    p p f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+# compute the relevant values for the interface particles
+
+variable ed      equal "v_epsilon2 - v_epsilon1"
+variable em      equal "(v_epsilon2 + v_epsilon1)/2"
+variable epsilon equal 1.0                             # epsilon at the patch, not used for now
+variable area    equal 0.866                           # patch area, same as in the data file
+
+read_data   ${data}
+read_data   data.confined
+Reading data file ...
+  orthogonal box = (0 0 0) to (40.000006 43.301277 40)
+  1 by 1 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  4002 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.001 seconds
+  read_data CPU = 0.013 seconds
+
+group interface type 1
+4000 atoms in group interface
+group ions type 2 3
+2 atoms in group ions
+
+group cations type 2
+1 atoms in group cations
+group anions  type 3
+1 atoms in group anions
+
+# set the dielectric constant of the medium where the ions reside
+
+set group cations epsilon ${epsilon2}
+set group cations epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+set group anions  epsilon ${epsilon2}
+set group anions  epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+
+pair_style      lj/cut/coul/long/dielectric 1.122 10.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+kspace_style    pppm/dielectric 0.0001
+kspace_modify   slab 3.0
+
+neigh_modify    every 1 delay 0 check yes one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump            2 interface custom 100 interface.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+#dump            3 ions custom 100 ions.dump id mol type q x y z fx fy fz #c_ef[1] c_ef[2] c_ef[3]
+
+fix             1 ions nve
+
+# fix modify is used to set the properties of the interface particle group
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == dof"   "fix             3 interface polarize/functional 1 0.0001"   "fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" else   "print 'Unsupported polarization solver' "
+fix             3 interface polarize/functional 1 0.0001
+fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL
+fix_modify      3 dielectrics -10 ${em} ${epsilon} ${area} NULL
+fix_modify      3 dielectrics -10 15 ${epsilon} ${area} NULL
+fix_modify      3 dielectrics -10 15 1 ${area} NULL
+fix_modify      3 dielectrics -10 15 1 0.866 NULL
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair #f_3
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.24260797
+  grid = 12 12 36
+  stencil order = 5
+  estimated absolute RMS force accuracy = 2.5219574e-07
+  estimated relative force accuracy = 2.5219574e-07
+  using double precision KISS FFT
+  3d grid and FFT values/proc = 10982 5184
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+Direct solver using a variational approach for 4000 induced charges
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 10.3
+  ghost atom cutoff = 10.3
+  binsize = 5.15, bins = 8 9 8
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair lj/cut/coul/long/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+  (2) fix polarize/functional, perpetual, copy from (1)
+      attributes: full, newton off
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 753.9 | 753.9 | 753.9 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0              0             -1.8534698e-05 -1.8534698e-05
+Loop time of 1.622e-06 on 1 procs for 0 steps with 4002 atoms
+
+185.0% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Kspace  | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.622e-06  |            |       |100.00
+
+Nlocal:           4002 ave        4002 max        4002 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:           4832 ave        4832 max        4832 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:  1.51316e+06 ave 1.51316e+06 max 1.51316e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 1513160
+Ave neighs/atom = 378.10095
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+
+
+Total wall time: 0:00:05
--- a/examples/PACKAGES/dielectric/log.07Mar23.confined.dof.g++.4
+++ b/examples/PACKAGES/dielectric/log.07Mar23.confined.dof.g++.4
@ -0,0 +1,192 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Two ions, a cation and an anion, confined between two interfaces: epsilon1 | epsilon2  | epsilon1
+# The interface normal vectors should be consistent with ed, pointing from region with epsilon1 to that with epsilon2
+# bottom interface: n = (0, 0, 1)
+# top interface:    n = (0, 0, -1)
+# so that ed's are the same for both interfaces
+
+# Dielectric constants can be set to be different from the input data file
+
+variable epsilon1 index 20
+variable epsilon2 index 10
+
+variable    data index data.confined
+
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    p p f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+# compute the relevant values for the interface particles
+
+variable ed      equal "v_epsilon2 - v_epsilon1"
+variable em      equal "(v_epsilon2 + v_epsilon1)/2"
+variable epsilon equal 1.0                             # epsilon at the patch, not used for now
+variable area    equal 0.866                           # patch area, same as in the data file
+
+read_data   ${data}
+read_data   data.confined
+Reading data file ...
+  orthogonal box = (0 0 0) to (40.000006 43.301277 40)
+  2 by 2 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  4002 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.013 seconds
+
+group interface type 1
+4000 atoms in group interface
+group ions type 2 3
+2 atoms in group ions
+
+group cations type 2
+1 atoms in group cations
+group anions  type 3
+1 atoms in group anions
+
+# set the dielectric constant of the medium where the ions reside
+
+set group cations epsilon ${epsilon2}
+set group cations epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+set group anions  epsilon ${epsilon2}
+set group anions  epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+
+pair_style      lj/cut/coul/long/dielectric 1.122 10.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+kspace_style    pppm/dielectric 0.0001
+kspace_modify   slab 3.0
+
+neigh_modify    every 1 delay 0 check yes one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump            2 interface custom 100 interface.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+#dump            3 ions custom 100 ions.dump id mol type q x y z fx fy fz #c_ef[1] c_ef[2] c_ef[3]
+
+fix             1 ions nve
+
+# fix modify is used to set the properties of the interface particle group
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == dof"   "fix             3 interface polarize/functional 1 0.0001"   "fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" else   "print 'Unsupported polarization solver' "
+fix             3 interface polarize/functional 1 0.0001
+fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL
+fix_modify      3 dielectrics -10 ${em} ${epsilon} ${area} NULL
+fix_modify      3 dielectrics -10 15 ${epsilon} ${area} NULL
+fix_modify      3 dielectrics -10 15 1 ${area} NULL
+fix_modify      3 dielectrics -10 15 1 0.866 NULL
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair #f_3
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.24260797
+  grid = 12 12 36
+  stencil order = 5
+  estimated absolute RMS force accuracy = 2.5219574e-07
+  estimated relative force accuracy = 2.5219574e-07
+  using double precision KISS FFT
+  3d grid and FFT values/proc = 4598 1296
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+Direct solver using a variational approach for 4000 induced charges
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 10.3
+  ghost atom cutoff = 10.3
+  binsize = 5.15, bins = 8 9 8
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair lj/cut/coul/long/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+  (2) fix polarize/functional, perpetual, copy from (1)
+      attributes: full, newton off
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 744 | 744 | 744 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0              0             -1.8534698e-05 -1.8534698e-05
+Loop time of 2.36225e-06 on 4 procs for 0 steps with 4002 atoms
+
+148.2% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Kspace  | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 2.362e-06  |            |       |100.00
+
+Nlocal:         1000.5 ave        1001 max        1000 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost:         2889.5 ave        2890 max        2889 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs:              0 ave           0 max           0 min
+Histogram: 4 0 0 0 0 0 0 0 0 0
+FullNghs:       378290 ave      378465 max      378117 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+
+Total # of neighbors = 1513160
+Ave neighs/atom = 378.10095
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+
+
+Total wall time: 0:00:02
--- a/examples/PACKAGES/dielectric/log.07Mar23.confined.gmres.g++.1
+++ b/examples/PACKAGES/dielectric/log.07Mar23.confined.gmres.g++.1
@ -0,0 +1,187 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Two ions, a cation and an anion, confined between two interfaces: epsilon1 | epsilon2  | epsilon1
+# The interface normal vectors should be consistent with ed, pointing from region with epsilon1 to that with epsilon2
+# bottom interface: n = (0, 0, 1)
+# top interface:    n = (0, 0, -1)
+# so that ed's are the same for both interfaces
+
+# Dielectric constants can be set to be different from the input data file
+
+variable epsilon1 index 20
+variable epsilon2 index 10
+
+variable    data index data.confined
+
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    p p f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+# compute the relevant values for the interface particles
+
+variable ed      equal "v_epsilon2 - v_epsilon1"
+variable em      equal "(v_epsilon2 + v_epsilon1)/2"
+variable epsilon equal 1.0                             # epsilon at the patch, not used for now
+variable area    equal 0.866                           # patch area, same as in the data file
+
+read_data   ${data}
+read_data   data.confined
+Reading data file ...
+  orthogonal box = (0 0 0) to (40.000006 43.301277 40)
+  1 by 1 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  4002 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.013 seconds
+
+group interface type 1
+4000 atoms in group interface
+group ions type 2 3
+2 atoms in group ions
+
+group cations type 2
+1 atoms in group cations
+group anions  type 3
+1 atoms in group anions
+
+# set the dielectric constant of the medium where the ions reside
+
+set group cations epsilon ${epsilon2}
+set group cations epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+set group anions  epsilon ${epsilon2}
+set group anions  epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+
+pair_style      lj/cut/coul/long/dielectric 1.122 10.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+kspace_style    pppm/dielectric 0.0001
+kspace_modify   slab 3.0
+
+neigh_modify    every 1 delay 0 check yes one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump            2 interface custom 100 interface.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+#dump            3 ions custom 100 ions.dump id mol type q x y z fx fy fz #c_ef[1] c_ef[2] c_ef[3]
+
+fix             1 ions nve
+
+# fix modify is used to set the properties of the interface particle group
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == dof"   "fix             3 interface polarize/functional 1 0.0001"   "fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" else   "print 'Unsupported polarization solver' "
+fix             3 interface polarize/bem/gmres 1 1.0e-4
+fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 ${em} ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 1 ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 1 0.866 NULL
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair #f_3
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.24260797
+  grid = 12 12 36
+  stencil order = 5
+  estimated absolute RMS force accuracy = 2.5219574e-07
+  estimated relative force accuracy = 2.5219574e-07
+  using double precision KISS FFT
+  3d grid and FFT values/proc = 10982 5184
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+BEM/GMRES solver for 4000 induced charges using maximum 3999 q-vectors
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 10.3
+  ghost atom cutoff = 10.3
+  binsize = 5.15, bins = 8 9 8
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 381.6 | 381.6 | 381.6 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -1.7228107e-08 -1.8534756e-05 -1.8551984e-05
+Loop time of 1.939e-06 on 1 procs for 0 steps with 4002 atoms
+
+103.1% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Kspace  | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.939e-06  |            |       |100.00
+
+Nlocal:           4002 ave        4002 max        4002 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:           4832 ave        4832 max        4832 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:  1.51316e+06 ave 1.51316e+06 max 1.51316e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 1513160
+Ave neighs/atom = 378.10095
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+
+
+Total wall time: 0:00:00
--- a/examples/PACKAGES/dielectric/log.07Mar23.confined.gmres.g++.4
+++ b/examples/PACKAGES/dielectric/log.07Mar23.confined.gmres.g++.4
@ -0,0 +1,187 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Two ions, a cation and an anion, confined between two interfaces: epsilon1 | epsilon2  | epsilon1
+# The interface normal vectors should be consistent with ed, pointing from region with epsilon1 to that with epsilon2
+# bottom interface: n = (0, 0, 1)
+# top interface:    n = (0, 0, -1)
+# so that ed's are the same for both interfaces
+
+# Dielectric constants can be set to be different from the input data file
+
+variable epsilon1 index 20
+variable epsilon2 index 10
+
+variable    data index data.confined
+
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    p p f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+# compute the relevant values for the interface particles
+
+variable ed      equal "v_epsilon2 - v_epsilon1"
+variable em      equal "(v_epsilon2 + v_epsilon1)/2"
+variable epsilon equal 1.0                             # epsilon at the patch, not used for now
+variable area    equal 0.866                           # patch area, same as in the data file
+
+read_data   ${data}
+read_data   data.confined
+Reading data file ...
+  orthogonal box = (0 0 0) to (40.000006 43.301277 40)
+  2 by 2 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  4002 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.001 seconds
+  read_data CPU = 0.012 seconds
+
+group interface type 1
+4000 atoms in group interface
+group ions type 2 3
+2 atoms in group ions
+
+group cations type 2
+1 atoms in group cations
+group anions  type 3
+1 atoms in group anions
+
+# set the dielectric constant of the medium where the ions reside
+
+set group cations epsilon ${epsilon2}
+set group cations epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+set group anions  epsilon ${epsilon2}
+set group anions  epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+
+pair_style      lj/cut/coul/long/dielectric 1.122 10.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+kspace_style    pppm/dielectric 0.0001
+kspace_modify   slab 3.0
+
+neigh_modify    every 1 delay 0 check yes one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump            2 interface custom 100 interface.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+#dump            3 ions custom 100 ions.dump id mol type q x y z fx fy fz #c_ef[1] c_ef[2] c_ef[3]
+
+fix             1 ions nve
+
+# fix modify is used to set the properties of the interface particle group
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == dof"   "fix             3 interface polarize/functional 1 0.0001"   "fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" else   "print 'Unsupported polarization solver' "
+fix             3 interface polarize/bem/gmres 1 1.0e-4
+fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 ${em} ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 1 ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 1 0.866 NULL
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair #f_3
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.24260797
+  grid = 12 12 36
+  stencil order = 5
+  estimated absolute RMS force accuracy = 2.5219574e-07
+  estimated relative force accuracy = 2.5219574e-07
+  using double precision KISS FFT
+  3d grid and FFT values/proc = 4598 1296
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+BEM/GMRES solver for 4000 induced charges using maximum 3999 q-vectors
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 10.3
+  ghost atom cutoff = 10.3
+  binsize = 5.15, bins = 8 9 8
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 376.2 | 376.2 | 376.2 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -1.7228107e-08 -1.8534756e-05 -1.8551984e-05
+Loop time of 1.6531e-05 on 4 procs for 0 steps with 4002 atoms
+
+102.8% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Kspace  | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.653e-05  |            |       |100.00
+
+Nlocal:         1000.5 ave        1001 max        1000 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost:         2889.5 ave        2890 max        2889 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs:              0 ave           0 max           0 min
+Histogram: 4 0 0 0 0 0 0 0 0 0
+FullNghs:       378290 ave      378465 max      378117 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+
+Total # of neighbors = 1513160
+Ave neighs/atom = 378.10095
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+
+
+Total wall time: 0:00:00
--- a/examples/PACKAGES/dielectric/log.07Mar23.confined.icc.g++.1
+++ b/examples/PACKAGES/dielectric/log.07Mar23.confined.icc.g++.1
@ -0,0 +1,189 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Two ions, a cation and an anion, confined between two interfaces: epsilon1 | epsilon2  | epsilon1
+# The interface normal vectors should be consistent with ed, pointing from region with epsilon1 to that with epsilon2
+# bottom interface: n = (0, 0, 1)
+# top interface:    n = (0, 0, -1)
+# so that ed's are the same for both interfaces
+
+# Dielectric constants can be set to be different from the input data file
+
+variable epsilon1 index 20
+variable epsilon2 index 10
+
+variable    data index data.confined
+
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    p p f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+# compute the relevant values for the interface particles
+
+variable ed      equal "v_epsilon2 - v_epsilon1"
+variable em      equal "(v_epsilon2 + v_epsilon1)/2"
+variable epsilon equal 1.0                             # epsilon at the patch, not used for now
+variable area    equal 0.866                           # patch area, same as in the data file
+
+read_data   ${data}
+read_data   data.confined
+Reading data file ...
+  orthogonal box = (0 0 0) to (40.000006 43.301277 40)
+  1 by 1 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  4002 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.001 seconds
+  read_data CPU = 0.013 seconds
+
+group interface type 1
+4000 atoms in group interface
+group ions type 2 3
+2 atoms in group ions
+
+group cations type 2
+1 atoms in group cations
+group anions  type 3
+1 atoms in group anions
+
+# set the dielectric constant of the medium where the ions reside
+
+set group cations epsilon ${epsilon2}
+set group cations epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+set group anions  epsilon ${epsilon2}
+set group anions  epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+
+pair_style      lj/cut/coul/long/dielectric 1.122 10.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+kspace_style    pppm/dielectric 0.0001
+kspace_modify   slab 3.0
+
+neigh_modify    every 1 delay 0 check yes one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump            2 interface custom 100 interface.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+#dump            3 ions custom 100 ions.dump id mol type q x y z fx fy fz #c_ef[1] c_ef[2] c_ef[3]
+
+fix             1 ions nve
+
+# fix modify is used to set the properties of the interface particle group
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == dof"   "fix             3 interface polarize/functional 1 0.0001"   "fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" else   "print 'Unsupported polarization solver' "
+fix             3 interface polarize/bem/icc 1 1.0e-4
+fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 ${em} ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 1 ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 1 0.866 NULL
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair #f_3
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.24260797
+  grid = 12 12 36
+  stencil order = 5
+  estimated absolute RMS force accuracy = 2.5219574e-07
+  estimated relative force accuracy = 2.5219574e-07
+  using double precision KISS FFT
+  3d grid and FFT values/proc = 10982 5184
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+BEM/ICC solver for 4000 induced charges
+ using pair style lj/cut/coul/long/dielectric
+ using kspace style pppm/dielectric
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 10.3
+  ghost atom cutoff = 10.3
+  binsize = 5.15, bins = 8 9 8
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 14.99 | 14.99 | 14.99 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -1.7228514e-08 -1.8534756e-05 -1.8551985e-05
+Loop time of 1.823e-06 on 1 procs for 0 steps with 4002 atoms
+
+109.7% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Kspace  | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.823e-06  |            |       |100.00
+
+Nlocal:           4002 ave        4002 max        4002 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:           4832 ave        4832 max        4832 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:  1.51316e+06 ave 1.51316e+06 max 1.51316e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 1513160
+Ave neighs/atom = 378.10095
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+
+
+Total wall time: 0:00:00
--- a/examples/PACKAGES/dielectric/log.07Mar23.confined.icc.g++.4
+++ b/examples/PACKAGES/dielectric/log.07Mar23.confined.icc.g++.4
@ -0,0 +1,189 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Two ions, a cation and an anion, confined between two interfaces: epsilon1 | epsilon2  | epsilon1
+# The interface normal vectors should be consistent with ed, pointing from region with epsilon1 to that with epsilon2
+# bottom interface: n = (0, 0, 1)
+# top interface:    n = (0, 0, -1)
+# so that ed's are the same for both interfaces
+
+# Dielectric constants can be set to be different from the input data file
+
+variable epsilon1 index 20
+variable epsilon2 index 10
+
+variable    data index data.confined
+
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    p p f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+# compute the relevant values for the interface particles
+
+variable ed      equal "v_epsilon2 - v_epsilon1"
+variable em      equal "(v_epsilon2 + v_epsilon1)/2"
+variable epsilon equal 1.0                             # epsilon at the patch, not used for now
+variable area    equal 0.866                           # patch area, same as in the data file
+
+read_data   ${data}
+read_data   data.confined
+Reading data file ...
+  orthogonal box = (0 0 0) to (40.000006 43.301277 40)
+  2 by 2 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  4002 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.012 seconds
+
+group interface type 1
+4000 atoms in group interface
+group ions type 2 3
+2 atoms in group ions
+
+group cations type 2
+1 atoms in group cations
+group anions  type 3
+1 atoms in group anions
+
+# set the dielectric constant of the medium where the ions reside
+
+set group cations epsilon ${epsilon2}
+set group cations epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+set group anions  epsilon ${epsilon2}
+set group anions  epsilon 10
+Setting atom values ...
+  1 settings made for epsilon
+
+pair_style      lj/cut/coul/long/dielectric 1.122 10.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+kspace_style    pppm/dielectric 0.0001
+kspace_modify   slab 3.0
+
+neigh_modify    every 1 delay 0 check yes one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump            2 interface custom 100 interface.dump id mol type q x y z #fx fy fz c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+#dump            3 ions custom 100 ions.dump id mol type q x y z fx fy fz #c_ef[1] c_ef[2] c_ef[3]
+
+fix             1 ions nve
+
+# fix modify is used to set the properties of the interface particle group
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4"   "fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" elif "${method} == dof"   "fix             3 interface polarize/functional 1 0.0001"   "fix_modify      3 dielectrics ${ed} ${em} ${epsilon} ${area} NULL" else   "print 'Unsupported polarization solver' "
+fix             3 interface polarize/bem/icc 1 1.0e-4
+fix_modify      3 itr_max 50 dielectrics ${ed} ${em} ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 ${em} ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 ${epsilon} ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 1 ${area} NULL
+fix_modify      3 itr_max 50 dielectrics -10 15 1 0.866 NULL
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair #f_3
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.24260797
+  grid = 12 12 36
+  stencil order = 5
+  estimated absolute RMS force accuracy = 2.5219574e-07
+  estimated relative force accuracy = 2.5219574e-07
+  using double precision KISS FFT
+  3d grid and FFT values/proc = 4598 1296
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+BEM/ICC solver for 4000 induced charges
+ using pair style lj/cut/coul/long/dielectric
+ using kspace style pppm/dielectric
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 10.3
+  ghost atom cutoff = 10.3
+  binsize = 5.15, bins = 8 9 8
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 9.647 | 9.647 | 9.647 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -1.7228514e-08 -1.8534756e-05 -1.8551985e-05
+Loop time of 2.29425e-06 on 4 procs for 0 steps with 4002 atoms
+
+141.7% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Kspace  | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 2.294e-06  |            |       |100.00
+
+Nlocal:         1000.5 ave        1001 max        1000 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost:         2889.5 ave        2890 max        2889 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs:              0 ave           0 max           0 min
+Histogram: 4 0 0 0 0 0 0 0 0 0
+FullNghs:       378290 ave      378465 max      378117 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+
+Total # of neighbors = 1513160
+Ave neighs/atom = 378.10095
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+
+
+Total wall time: 0:00:00
--- a/examples/PACKAGES/dielectric/log.07Mar23.nopbc.dof.g++.1
+++ b/examples/PACKAGES/dielectric/log.07Mar23.nopbc.dof.g++.1
@ -0,0 +1,136 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Interface
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    f f f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+variable    data index data.sphere
+
+read_data   ${data}
+read_data   data.sphere
+Reading data file ...
+  orthogonal box = (0 0 0) to (100 100 100)
+  1 by 1 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  643 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.004 seconds
+
+group interface type 1
+642 atoms in group interface
+group ions type 2 3
+1 atoms in group ions
+
+pair_style      lj/cut/coul/cut/dielectric 1.122 20.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+neigh_modify    one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#
+#dump            2 interface custom 100 interface.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+fix             1 ions nve
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4" elif "${method} == dof"   "fix             3 interface polarize/functional 1 1.0e-4" else   "print 'Unsupported method for polarization' "
+fix             3 interface polarize/functional 1 1.0e-4
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+Direct solver using a variational approach for 642 induced charges
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 20.3
+  ghost atom cutoff = 20.3
+  binsize = 10.15, bins = 10 10 10
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair lj/cut/coul/cut/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+  (2) fix polarize/functional, perpetual, copy from (1)
+      attributes: full, newton off
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 29.83 | 29.83 | 29.83 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -0.011053355    0             -0.011053355  
+Loop time of 1.691e-06 on 1 procs for 0 steps with 643 atoms
+
+177.4% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.691e-06  |            |       |100.00
+
+Nlocal:            643 ave         643 max         643 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:       412806 ave      412806 max      412806 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 412806
+Ave neighs/atom = 642
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+Total wall time: 0:00:02
--- a/examples/PACKAGES/dielectric/log.07Mar23.nopbc.dof.g++.4
+++ b/examples/PACKAGES/dielectric/log.07Mar23.nopbc.dof.g++.4
@ -0,0 +1,136 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Interface
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    f f f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+variable    data index data.sphere
+
+read_data   ${data}
+read_data   data.sphere
+Reading data file ...
+  orthogonal box = (0 0 0) to (100 100 100)
+  1 by 2 by 2 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  643 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.001 seconds
+  read_data CPU = 0.004 seconds
+
+group interface type 1
+642 atoms in group interface
+group ions type 2 3
+1 atoms in group ions
+
+pair_style      lj/cut/coul/cut/dielectric 1.122 20.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+neigh_modify    one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#
+#dump            2 interface custom 100 interface.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+fix             1 ions nve
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4" elif "${method} == dof"   "fix             3 interface polarize/functional 1 1.0e-4" else   "print 'Unsupported method for polarization' "
+fix             3 interface polarize/functional 1 1.0e-4
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+Direct solver using a variational approach for 642 induced charges
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 20.3
+  ghost atom cutoff = 20.3
+  binsize = 10.15, bins = 10 10 10
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair lj/cut/coul/cut/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+  (2) fix polarize/functional, perpetual, copy from (1)
+      attributes: full, newton off
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 26.9 | 27.48 | 27.67 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -0.011053355    0             -0.011053355  
+Loop time of 2.96225e-06 on 4 procs for 0 steps with 643 atoms
+
+143.5% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 2.962e-06  |            |       |100.00
+
+Nlocal:         160.75 ave         178 max         145 min
+Histogram: 1 0 0 0 2 0 0 0 0 1
+Nghost:         482.25 ave         498 max         465 min
+Histogram: 1 0 0 0 0 2 0 0 0 1
+Neighs:              0 ave           0 max           0 min
+Histogram: 4 0 0 0 0 0 0 0 0 0
+FullNghs:       103202 ave      114276 max       93090 min
+Histogram: 1 0 0 0 2 0 0 0 0 1
+
+Total # of neighbors = 412806
+Ave neighs/atom = 642
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+Total wall time: 0:00:01
--- a/examples/PACKAGES/dielectric/log.07Mar23.nopbc.gmres.g++.1
+++ b/examples/PACKAGES/dielectric/log.07Mar23.nopbc.gmres.g++.1
@ -0,0 +1,131 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Interface
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    f f f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+variable    data index data.sphere
+
+read_data   ${data}
+read_data   data.sphere
+Reading data file ...
+  orthogonal box = (0 0 0) to (100 100 100)
+  1 by 1 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  643 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.004 seconds
+
+group interface type 1
+642 atoms in group interface
+group ions type 2 3
+1 atoms in group ions
+
+pair_style      lj/cut/coul/cut/dielectric 1.122 20.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+neigh_modify    one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#
+#dump            2 interface custom 100 interface.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+fix             1 ions nve
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4" elif "${method} == dof"   "fix             3 interface polarize/functional 1 1.0e-4" else   "print 'Unsupported method for polarization' "
+fix             3 interface polarize/bem/gmres 1 1.0e-4
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+BEM/GMRES solver for 642 induced charges using maximum 641 q-vectors
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 20.3
+  ghost atom cutoff = 20.3
+  binsize = 10.15, bins = 10 10 10
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/cut/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 18.6 | 18.6 | 18.6 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -0.011226675    0             -0.011226675  
+Loop time of 1.659e-06 on 1 procs for 0 steps with 643 atoms
+
+120.6% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.659e-06  |            |       |100.00
+
+Nlocal:            643 ave         643 max         643 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:       412806 ave      412806 max      412806 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 412806
+Ave neighs/atom = 642
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+Total wall time: 0:00:00
--- a/examples/PACKAGES/dielectric/log.07Mar23.nopbc.gmres.g++.4
+++ b/examples/PACKAGES/dielectric/log.07Mar23.nopbc.gmres.g++.4
@ -0,0 +1,131 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Interface
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    f f f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+variable    data index data.sphere
+
+read_data   ${data}
+read_data   data.sphere
+Reading data file ...
+  orthogonal box = (0 0 0) to (100 100 100)
+  1 by 2 by 2 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  643 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.003 seconds
+
+group interface type 1
+642 atoms in group interface
+group ions type 2 3
+1 atoms in group ions
+
+pair_style      lj/cut/coul/cut/dielectric 1.122 20.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+neigh_modify    one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#
+#dump            2 interface custom 100 interface.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+fix             1 ions nve
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4" elif "${method} == dof"   "fix             3 interface polarize/functional 1 1.0e-4" else   "print 'Unsupported method for polarization' "
+fix             3 interface polarize/bem/gmres 1 1.0e-4
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+BEM/GMRES solver for 642 induced charges using maximum 641 q-vectors
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 20.3
+  ghost atom cutoff = 20.3
+  binsize = 10.15, bins = 10 10 10
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/cut/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 17.2 | 17.49 | 17.58 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -0.011226675    0             -0.011226675  
+Loop time of 6.7975e-06 on 4 procs for 0 steps with 643 atoms
+
+80.9% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 6.797e-06  |            |       |100.00
+
+Nlocal:         160.75 ave         178 max         145 min
+Histogram: 1 0 0 0 2 0 0 0 0 1
+Nghost:         482.25 ave         498 max         465 min
+Histogram: 1 0 0 0 0 2 0 0 0 1
+Neighs:              0 ave           0 max           0 min
+Histogram: 4 0 0 0 0 0 0 0 0 0
+FullNghs:       103202 ave      114276 max       93090 min
+Histogram: 1 0 0 0 2 0 0 0 0 1
+
+Total # of neighbors = 412806
+Ave neighs/atom = 642
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+Total wall time: 0:00:00
--- a/examples/PACKAGES/dielectric/log.07Mar23.nopbc.icc.g++.1
+++ b/examples/PACKAGES/dielectric/log.07Mar23.nopbc.icc.g++.1
@ -0,0 +1,132 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Interface
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    f f f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+variable    data index data.sphere
+
+read_data   ${data}
+read_data   data.sphere
+Reading data file ...
+  orthogonal box = (0 0 0) to (100 100 100)
+  1 by 1 by 1 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  643 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.004 seconds
+
+group interface type 1
+642 atoms in group interface
+group ions type 2 3
+1 atoms in group ions
+
+pair_style      lj/cut/coul/cut/dielectric 1.122 20.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+neigh_modify    one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#
+#dump            2 interface custom 100 interface.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+fix             1 ions nve
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4" elif "${method} == dof"   "fix             3 interface polarize/functional 1 1.0e-4" else   "print 'Unsupported method for polarization' "
+fix             3 interface polarize/bem/icc 1 1.0e-4
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+BEM/ICC solver for 642 induced charges
+ using pair style lj/cut/coul/cut/dielectric
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 20.3
+  ghost atom cutoff = 20.3
+  binsize = 10.15, bins = 10 10 10
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/cut/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 8.898 | 8.898 | 8.898 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -0.011226707    0             -0.011226707  
+Loop time of 1.532e-06 on 1 procs for 0 steps with 643 atoms
+
+130.5% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.532e-06  |            |       |100.00
+
+Nlocal:            643 ave         643 max         643 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:              0 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:       412806 ave      412806 max      412806 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 412806
+Ave neighs/atom = 642
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+Total wall time: 0:00:00
--- a/examples/PACKAGES/dielectric/log.07Mar23.nopbc.icc.g++.4
+++ b/examples/PACKAGES/dielectric/log.07Mar23.nopbc.icc.g++.4
@ -0,0 +1,132 @@
+LAMMPS (8 Feb 2023)
+  using 1 OpenMP thread(s) per MPI task
+# Interface
+newton off
+units       lj
+atom_style  dielectric
+atom_modify map array
+dimension   3
+boundary    f f f
+
+variable    method index gmres  # gmres = BEM/GMRES
+                                # icc   = BEM/ICC*
+                                # dof   = Direct optimization of the functional
+                                # none
+
+variable    data index data.sphere
+
+read_data   ${data}
+read_data   data.sphere
+Reading data file ...
+  orthogonal box = (0 0 0) to (100 100 100)
+  1 by 2 by 2 MPI processor grid
+WARNING: Atom style in data file differs from currently defined atom style (src/read_data.cpp:620)
+  reading atoms ...
+  643 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.001 seconds
+  read_data CPU = 0.004 seconds
+
+group interface type 1
+642 atoms in group interface
+group ions type 2 3
+1 atoms in group ions
+
+pair_style      lj/cut/coul/cut/dielectric 1.122 20.0
+pair_coeff      * * 1.0 1.0
+pair_coeff      1 1 0.0 1.0
+
+neigh_modify    one 5000
+
+#compute         ef all efield/atom
+#dump            1 all custom 100 all.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#
+#dump            2 interface custom 100 interface.dump id mol type q x y z #c_ef[1] c_ef[2] c_ef[3]
+#dump_modify     1 sort id
+
+fix             1 ions nve
+
+if "${method} == gmres" then   "fix             3 interface polarize/bem/gmres 1 1.0e-4" elif "${method} == icc"  "fix             3 interface polarize/bem/icc 1 1.0e-4" elif "${method} == dof"   "fix             3 interface polarize/functional 1 1.0e-4" else   "print 'Unsupported method for polarization' "
+fix             3 interface polarize/bem/icc 1 1.0e-4
+
+thermo          1000
+thermo_style    custom step evdwl ecoul elong epair
+thermo_modify   flush yes
+
+run             0
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- DIELECTRIC package: doi:10.1016/j.cpc.2019.03.006
+
+@Article{TrungCPC19,
+ author = {Trung Dac Nguyen and Honghao Li and Debarshee Bagchi and   Francisco J. Solis and Olvera de la Cruz, Monica}
+ title = {Incorporating Surface Polarization Effects Into Large-Scale
+   Coarse-Grained Molecular Dynamics Simulation},
+ journal = {Comput.\ Phys.\ Commun.},
+ year =    2019,
+ volume =  241,
+ pages =   {80--91}
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Generated 0 of 3 mixed pair_coeff terms from geometric mixing rule
+BEM/ICC solver for 642 induced charges
+ using pair style lj/cut/coul/cut/dielectric
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 5000, page size: 100000
+  master list distance cutoff = 20.3
+  ghost atom cutoff = 20.3
+  binsize = 10.15, bins = 10 10 10
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/cut/dielectric, perpetual
+      attributes: full, newton off
+      pair build: full/bin
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 7.496 | 7.783 | 7.878 Mbytes
+   Step         E_vdwl         E_coul         E_long         E_pair    
+         0   0             -0.011226707    0             -0.011226707  
+Loop time of 6.43925e-06 on 4 procs for 0 steps with 643 atoms
+
+93.2% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 6.439e-06  |            |       |100.00
+
+Nlocal:         160.75 ave         178 max         145 min
+Histogram: 1 0 0 0 2 0 0 0 0 1
+Nghost:         482.25 ave         498 max         465 min
+Histogram: 1 0 0 0 0 2 0 0 0 1
+Neighs:              0 ave           0 max           0 min
+Histogram: 4 0 0 0 0 0 0 0 0 0
+FullNghs:       103202 ave      114276 max       93090 min
+Histogram: 1 0 0 0 2 0 0 0 0 1
+
+Total # of neighbors = 412806
+Ave neighs/atom = 642
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+
+
+
+Total wall time: 0:00:00
--- a/lib/gpu/Makefile.oneapi
+++ b/lib/gpu/Makefile.oneapi
@ -12,13 +12,12 @@ EXTRAMAKE = Makefile.lammps.opencl
 LMP_INC = -DLAMMPS_SMALLBIG

 OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
-CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -fp-model fast=2 -no-prec-div \
-          -qoverride-limits
-OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
+CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
+OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
          $(LMP_INC) $(OCL_INC) $(CPP_OPT)
 OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
 OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
+OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DGERYON_NO_OCL_MARKERS

 BIN_DIR = ./
 OBJ_DIR = ./
--- a/lib/gpu/Makefile.oneapi_prof
+++ b/lib/gpu/Makefile.oneapi_prof
@ -0,0 +1,28 @@
+# /* ----------------------------------------------------------------------   
+#  Linux Makefile for Intel oneAPI - Mixed precision (with timing enabled)
+# ------------------------------------------------------------------------- */
+
+# which file will be copied to Makefile.lammps
+
+EXTRAMAKE = Makefile.lammps.opencl
+
+# this setting should match LAMMPS Makefile
+# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
+CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
+OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
+          $(LMP_INC) $(OCL_INC) $(CPP_OPT)
+OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
+OCL_PREC = -D_SINGLE_DOUBLE
+OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
+
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+
+include Opencl.makefile
--- a/lib/gpu/README
+++ b/lib/gpu/README
@ -266,6 +266,7 @@ LAL_SERIALIZE_INIT      Force serialization of initialization and compilation
                        for multiple MPI tasks sharing the same accelerator.
                        Some accelerator API implementations have had issues
                        with temporary file conflicts in the past.
+LAL_DISABLE_PREFETCH    Disable prefetch in kernels
 GERYON_FORCE_SHARED_MAIN_MEM_ON      Should only be used for builds where the
                                     accelerator is guaranteed to share physical
                                     main memory with the host (e.g. integrated
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@ -429,7 +429,7 @@ void UCL_Device::clear() {
    CU_SAFE_CALL_NS(cuCtxSetCurrent(_old_context));
    CU_SAFE_CALL_NS(cuDevicePrimaryCtxRelease(_cu_device));
 #else
-    cuCtxDestroy(_context));
+    cuCtxDestroy(_context);
 #endif
  }
  _device=-1;
--- a/lib/gpu/geryon/ocl_kernel.h
+++ b/lib/gpu/geryon/ocl_kernel.h
@ -95,7 +95,8 @@ class UCL_Program {

  /// Load a program from a string and compile with flags
  inline int load_string(const void *program, const char *flags="",
-                         std::string *log=nullptr, FILE* foutput=nullptr) {
+                         std::string *log=nullptr, FILE* foutput=nullptr,
+                         const int compile_test=0) {
    cl_int error_flag;
    const char *prog=(const char *)program;
    _program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag);
@ -131,6 +132,8 @@ class UCL_Program {
    }
    #endif

+    if (build_status != CL_SUCCESS && compile_test) return UCL_COMPILE_ERROR;
+
    if (build_status != CL_SUCCESS || log!=NULL) {
      size_t ms;
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -113,7 +113,7 @@ _texture( q_tex,int2);
    dufld[5]=red_acc[5][tid];                                               \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -147,7 +147,7 @@ _texture( q_tex,int2);
    _fieldp[5]=red_acc[5][tid];                                             \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -174,7 +174,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -254,7 +254,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -277,7 +277,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -302,7 +302,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -391,7 +391,7 @@ _texture( q_tex,int2);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -416,9 +416,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
-                                 __global acctyp4 *restrict tep,
+                                 __global acctyp3 *restrict tep,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
                                 const int t_per_atom, const numtyp aewald,
@ -431,7 +431,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -440,9 +440,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

-  acctyp4 tq;
+  acctyp3 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
-  
+
  const __global numtyp4* polar1 = &extra[0];
  const __global numtyp4* polar2 = &extra[nall];
  const __global numtyp4* polar3 = &extra[2*nall];
@ -695,7 +695,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict fieldp,
+                                 __global acctyp3 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
@ -889,7 +889,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict fieldp,
+                                 __global acctyp3 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
@ -1052,9 +1052,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             const __global int *dev_short_nbor,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
-                             __global acctyp4 *restrict tep,
+                             __global acctyp3 *restrict tep,
                             const int eflag, const int vflag, const int inum,
                             const int nall, const int nbor_pitch, const int t_per_atom,
                             const numtyp aewald, const numtyp felec,
@ -1067,7 +1067,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -1082,7 +1082,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
  for (int l=0; l<6; l++) dufld[l]=(acctyp)0;

  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
-  
+
  const __global numtyp4* polar1 = &extra[0];
  const __global numtyp4* polar2 = &extra[nall];
  const __global numtyp4* polar3 = &extra[2*nall];
@ -1226,7 +1226,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp prc3[3],prc5[3],prc7[3];
      numtyp drc3[3],drc5[3],drc7[3];
      numtyp urc3[3],urc5[3];
-    
+
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[5];
@ -1583,12 +1583,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
  if (ii<inum) {

    const int nlpts = (bsorder-1) / 2;
-    
+
    int istart = fast_mul(ii,4);
    const int igridx = igrid[istart];
    const int igridy = igrid[istart+1];
    const int igridz = igrid[istart+2];
-    
+
    // now istart is used to index thetai1, thetai2 and thetai3
    istart = fast_mul(ii,bsorder);

@ -1782,7 +1782,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
    fdip_buf[7] = tuv110_1;
    fdip_buf[8] = tuv101_1;
    fdip_buf[9] = tuv011_1;
-    idx = ii;    
+    idx = ii;
    for (int m = 0; m < 10; m++) {
      fdip_phi1[idx] = fdip_buf[m];
      idx += inum;
@ -1798,7 +1798,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
    fdip_buf[7] = tuv110_2;
    fdip_buf[8] = tuv101_2;
    fdip_buf[9] = tuv011_2;
-    idx = ii;    
+    idx = ii;
    for (int m = 0; m < 10; m++) {
      fdip_phi2[idx] = fdip_buf[m];
      idx += inum;
@ -1824,7 +1824,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
    fdip_buf[17] = tuv102;
    fdip_buf[18] = tuv012;
    fdip_buf[19] = tuv111;
-    idx = ii;    
+    idx = ii;
    for (int m = 0; m < 20; m++) {
      fdip_sum_phi[idx] = fdip_buf[m];
      idx += inum;
@ -1855,12 +1855,12 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
  if (ii<inum) {

    int nlpts = (bsorder-1) / 2;
-    
+
    int istart = fast_mul(ii,4);
    int igridx = igrid[istart];
    int igridy = igrid[istart+1];
    int igridz = igrid[istart+2];
-    
+
    // now istart is used to index thetai1, thetai2 and thetai3
    istart = fast_mul(ii,bsorder);

@ -1990,7 +1990,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
    buf[18] = tuv012;
    buf[19] = tuv111;

-    int idx = ii;    
+    int idx = ii;
    for (int m = 0; m < 20; m++) {
      fphi[idx] = felec * buf[m];
      idx += inum;
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@ -28,9 +28,9 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),

 template <class numtyp, class acctyp>
 int AnswerT::bytes_per_atom() const {
-  int bytes=11*sizeof(acctyp);
+  int bytes=10*sizeof(acctyp);
  if (_rot)
-    bytes+=4*sizeof(acctyp);
+    bytes+=3*sizeof(acctyp);
  if (_charge)
    bytes+=sizeof(acctyp);
  return bytes;
@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) {

  bool success=true;

-  _ans_fields=4;
+  _ans_fields=3;
  if (_rot)
-    _ans_fields+=4;
+    _ans_fields+=3;

  // ---------------------------  Device allocations
  success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
@ -134,11 +134,11 @@ void AnswerT::clear() {

 template <class numtyp, class acctyp>
 double AnswerT::host_memory_usage() const {
-  int atom_bytes=4;
+  int atom_bytes=3;
  if (_charge)
    atom_bytes+=1;
  if (_rot)
-    atom_bytes+=4;
+    atom_bytes+=3;
  int ans_bytes=atom_bytes+_ev_fields;
  return ans_bytes*(_max_local)*sizeof(acctyp)+
         sizeof(Answer<numtyp,acctyp>);
@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
  if (csize>0)
    engv.update_host(_ev_stride*csize,true);
  if (_rot)
-    force.update_host(_inum*4*2,true);
+    force.update_host(_inum*3*2,true);
  else
-    force.update_host(_inum*4,true);
+    force.update_host(_inum*3,true);
  time_answer.stop();

  #ifndef GERYON_OCL_FLUSH
@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
 template <class numtyp, class acctyp>
 void AnswerT::get_answers(double **f, double **tor) {
  if (_ilist==nullptr) {
-    typedef struct { double x,y,z; } vec3d;
-    typedef struct { acctyp x,y,z,w; } vec4d_t;
-    auto fp=reinterpret_cast<vec3d*>(&(f[0][0]));
-    auto forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
+    auto fp=reinterpret_cast<double*>(&(f[0][0]));

    #if (LAL_USE_OMP == 1)
    #pragma omp parallel
@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) {
      #if (LAL_USE_OMP == 1)
      const int nthreads = omp_get_num_threads();
      const int tid = omp_get_thread_num();
-      const int idelta = _inum / nthreads + 1;
+      const int idelta = _inum*3 / nthreads + 1;
      const int ifrom = tid * idelta;
-      const int ito = std::min(ifrom + idelta, _inum);
+      const int ito = std::min(ifrom + idelta, _inum*3);
      #else
      const int ifrom = 0;
-      const int ito = _inum;
+      const int ito = _inum*3;
      #endif

-      for (int i=ifrom; i<ito; i++) {
-        fp[i].x+=forcep[i].x;
-        fp[i].y+=forcep[i].y;
-        fp[i].z+=forcep[i].z;
-      }
+      for (int i=ifrom; i<ito; i++)
+        fp[i]+=force[i];
      if (_rot) {
-        auto torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
-        auto torquep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
-        for (int i=ifrom; i<ito; i++) {
-          torp[i].x+=torquep[i].x;
-          torp[i].y+=torquep[i].y;
-          torp[i].z+=torquep[i].z;
-        }
+        auto torp=reinterpret_cast<double*>(&(tor[0][0]));
+        auto torquep=&(force[_inum*3]);
+        for (int i=ifrom; i<ito; i++)
+          torp[i]+=torquep[i];
      }
    }
  } else {
@ -344,7 +335,7 @@ void AnswerT::get_answers(double **f, double **tor) {
      const int idelta = _inum / nthreads + 1;
      const int ifrom = tid * idelta;
      const int ito = std::min(ifrom + idelta, _inum);
-      int fl=ifrom*4;
+      int fl=ifrom*3;
      #else
      const int ifrom = 0;
      const int ito = _inum;
@ -356,16 +347,16 @@ void AnswerT::get_answers(double **f, double **tor) {
        f[ii][0]+=force[fl];
        f[ii][1]+=force[fl+1];
        f[ii][2]+=force[fl+2];
-        fl+=4;
+        fl+=3;
      }
      if (_rot) {
-        fl=_inum*4 + ifrom*4;
+        fl=_inum*3 + ifrom*3;
        for (int i=ifrom; i<ito; i++) {
          int ii=_ilist[i];
          tor[ii][0]+=force[fl];
          tor[ii][1]+=force[fl+1];
          tor[ii][2]+=force[fl+2];
-          fl+=4;
+          fl+=3;
        }
      }
    }
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@ -114,7 +114,7 @@ bool AtomT::alloc(const int nall) {
                                UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=q.device.row_bytes();
  }
-  if (_rot && !_host_view) {
+  if (_rot) {
    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                   UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=quat.device.row_bytes();
@ -182,11 +182,9 @@ bool AtomT::add_fields(const bool charge, const bool rot,
  if (rot && !_rot) {
    _rot=true;
    _other=true;
-    if (!_host_view) {
-      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
-                                     UCL_READ_ONLY)==UCL_SUCCESS);
-      gpu_bytes+=quat.device.row_bytes();
-    }
+    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=quat.device.row_bytes();
  }

  if (vel && !_vel) {
@ -451,7 +449,7 @@ template <class numtyp, class acctyp>
 void AtomT::compile_kernels(UCL_Device &dev) {
  std::string flags = "";
  atom_program=new UCL_Program(dev);
-  atom_program->load_string(atom,flags,nullptr,screen);
+  atom_program->load_string(atom,flags.c_str(),nullptr,stderr);
  k_cast_x.set_function(*atom_program,"kernel_cast_x");
  _compiled=true;
 }
--- a/lib/gpu/lal_atom.cu
+++ b/lib/gpu/lal_atom.cu
@ -18,7 +18,7 @@
 #endif

 __kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
-                            const __global numtyp *restrict x,
+                            const __global double *restrict x,
                            const __global int *restrict type,
                            const int nall) {
  int ii=GLOBAL_ID_X;
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@ -52,6 +52,12 @@ using namespace ucl_cudadr;

 namespace LAMMPS_AL {

+struct EllipsoidBonus {
+  double shape[3];
+  double quat[4];
+  int ilocal;
+};
+
 template <class numtyp, class acctyp>
 class Atom {
 public:
@ -306,8 +312,8 @@ class Atom {
    if (_x_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
-      memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
-      memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
+      memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
+      memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
      #else
      vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
      vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
@ -351,6 +357,24 @@ class Atom {
    add_x_data(host_ptr,host_type);
  }

+  // Cast mu data to write buffer (stored in quat)
+  template<class cpytyp>
+  inline void cast_mu_data(cpytyp *host_ptr) {
+    if (_quat_avail==false) {
+      double t=MPI_Wtime();
+      if (sizeof(numtyp)==sizeof(double))
+        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
+      else
+        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
+        #pragma omp parallel for simd schedule(static)
+        #elif (LAL_USE_OMP_SIMD == 1)
+        #pragma omp simd
+        #endif
+        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
+      _time_cast+=MPI_Wtime()-t;
+    }
+  }
+
  // Cast charges to write buffer
  template<class cpytyp>
  inline void cast_q_data(cpytyp *host_ptr) {
@ -384,22 +408,24 @@ class Atom {
  }

  // Cast quaternions to write buffer
-  template<class cpytyp>
-  inline void cast_quat_data(cpytyp *host_ptr) {
+  inline void cast_quat_data(const int *ellipsoid,
+                             const EllipsoidBonus *bonus) {
    if (_quat_avail==false) {
      double t=MPI_Wtime();
-      if (_host_view) {
-        quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
-        quat.device.view(quat.host);
-      } else if (sizeof(numtyp)==sizeof(double))
-        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
-      else
-        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
-        #pragma omp parallel for simd schedule(static)
-        #elif (LAL_USE_OMP_SIMD == 1)
-        #pragma omp simd
-        #endif
-        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
+      #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
+      #pragma omp parallel for simd schedule(static)
+      #elif (LAL_USE_OMP_SIMD == 1)
+      #pragma omp simd
+      #endif
+      for (int i=0; i<_nall; i++) {
+        int qi = ellipsoid[i];
+        if (qi > -1) {
+          quat[i*4] = bonus[qi].quat[0];
+          quat[i*4+1] = bonus[qi].quat[1];
+          quat[i*4+2] = bonus[qi].quat[2];
+          quat[i*4+3] = bonus[qi].quat[3];
+        }
+      }
      _time_cast+=MPI_Wtime()-t;
    }
  }
@ -419,10 +445,6 @@ class Atom {
  inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
    if (_v_avail==false) {
      double t=MPI_Wtime();
-      #ifdef GPU_CAST
-      memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
-      memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
-      #else
      vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
      vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
      #if (LAL_USE_OMP == 1)
@ -434,7 +456,6 @@ class Atom {
        vp[i].z=host_p[i].z;
        vp[i].w=host_tag[i];
      }
-      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }
@ -444,16 +465,7 @@ class Atom {
  inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
    time_vel.start();
    if (_v_avail==false) {
-      #ifdef GPU_CAST
-      v_cast.update_device(_nall*3,true);
-      tag_cast.update_device(_nall,true);
-      int block_size=64;
-      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
-      k_cast_x.set_size(GX,block_size);
-      k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
-      #else
      v.update_device(_nall*4,true);
-      #endif
      _v_avail=true;
    }
    time_vel.stop();
@ -519,7 +531,7 @@ class Atom {
  UCL_Vector<numtyp4,numtyp4> extra;

  #ifdef GPU_CAST
-  UCL_Vector<numtyp,numtyp> x_cast;
+  UCL_Vector<double,double> x_cast;
  UCL_Vector<int,int> type_cast;
  #endif

--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -143,10 +143,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);

  _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _tep.alloc(_max_tep_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);

  _max_fieldp_size = _max_tep_size;
-  _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _fieldp.alloc(_max_fieldp_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);

  _max_thetai_size = 0;

@ -387,7 +387,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,

  if (inum_full>_max_tep_size) {
    _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
-    _tep.resize(_max_tep_size*4);
+    _tep.resize(_max_tep_size*3);
  }
  *tep_ptr=_tep.host.begin();

@ -403,7 +403,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,

  // copy tep from device to host

-  _tep.update_host(_max_tep_size*4,false);
+  _tep.update_host(_max_tep_size*3,false);
 }

 // ---------------------------------------------------------------------------
@ -429,7 +429,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)

-  _fieldp.update_host(_max_fieldp_size*8,false);
+  _fieldp.update_host(_max_fieldp_size*6,false);
 }

 // ---------------------------------------------------------------------------
@ -456,7 +456,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  //       after umutual1 and self are done on the GPU
  // *fieldp_ptr=_fieldp.host.begin();
-  // _fieldp.update_host(_max_fieldp_size*8,false);
+  // _fieldp.update_host(_max_fieldp_size*6,false);
 }

 // ---------------------------------------------------------------------------
@ -732,7 +732,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
  device->add_ans_object(ans);

  // copy tep from device to host
-  _tep.update_host(_max_tep_size*4,false);
+  _tep.update_host(_max_tep_size*3,false);
 }

 // ---------------------------------------------------------------------------
--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@ -233,7 +233,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,

  atom->cast_x_data(host_x,host_type);
  atom->cast_q_data(host_q);
-  atom->cast_quat_data(host_mu[0]);
+  atom->cast_mu_data(host_mu[0]);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);
  atom->add_q_data();
@ -297,12 +297,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
    if (!success)
      return nullptr;
    atom->cast_q_data(host_q);
-    atom->cast_quat_data(host_mu[0]);
+    atom->cast_mu_data(host_mu[0]);
    hd_balancer.start_timer();
  } else {
    atom->cast_x_data(host_x,host_type);
    atom->cast_q_data(host_q);
-    atom->cast_quat_data(host_mu[0]);
+    atom->cast_mu_data(host_mu[0]);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@ -375,7 +375,8 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
                             const bool eflag_in, const bool vflag_in,
                             const bool eatom, const bool vatom,
                             int &host_start, const double cpu_time,
-                             bool &success, double **host_quat) {
+                             bool &success, const int *ellipsoid,
+                             const EllipsoidBonus *bonus) {
  acc_timers();
  int eflag, vflag;
  if (eflag_in) eflag=2;
@ -409,7 +410,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
    list=ilist;

  atom->cast_x_data(host_x,host_type);
-  atom->cast_quat_data(host_quat[0]);
+  atom->cast_quat_data(ellipsoid,bonus);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);
  atom->add_quat_data();
@ -433,7 +434,8 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
                              const bool eatom, const bool vatom,
                              int &host_start, int **ilist, int **jnum,
                              const double cpu_time, bool &success,
-                              double **host_quat) {
+                              const int *ellipsoid,
+                              const EllipsoidBonus *bonus) {
  acc_timers();
  int eflag, vflag;
  if (eflag_in) eflag=2;
@ -460,11 +462,11 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
                    sublo, subhi, tag, nspecial, special, success);
    if (!success)
      return nullptr;
-    atom->cast_quat_data(host_quat[0]);
+    atom->cast_quat_data(ellipsoid,bonus);
    hd_balancer.start_timer();
  } else {
    atom->cast_x_data(host_x,host_type);
-    atom->cast_quat_data(host_quat[0]);
+    atom->cast_quat_data(ellipsoid,bonus);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
--- a/lib/gpu/lal_base_ellipsoid.h
+++ b/lib/gpu/lal_base_ellipsoid.h
@ -170,7 +170,8 @@ class BaseEllipsoid {
               double **host_x, int *host_type, int *ilist, int *numj,
               int **firstneigh, const bool eflag, const bool vflag,
               const bool eatom, const bool vatom, int &host_start,
-               const double cpu_time, bool &success, double **quat);
+               const double cpu_time, bool &success,
+               const int *ellipsoid, const EllipsoidBonus *bonus);

  /// Pair loop with device neighboring
  int**compute(const int ago, const int inum_full, const int nall,
@ -179,7 +180,7 @@ class BaseEllipsoid {
               tagint **special, const bool eflag, const bool vflag,
               const bool eatom, const bool vatom, int &host_start,
               int **ilist, int **numj, const double cpu_time, bool &success,
-               double **host_quat);
+               const int *ellipsoid, const EllipsoidBonus *bonus);

  // -------------------------- DEVICE DATA -------------------------

--- a/lib/gpu/lal_beck.cu
+++ b/lib/gpu/lal_beck.cu
@ -31,7 +31,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -130,7 +131,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -150,7 +151,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
    beck2[tid]=beck2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -172,6 +173,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@ -32,7 +32,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -48,7 +48,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -123,7 +124,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -144,7 +145,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -166,6 +167,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@ -36,7 +36,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -158,7 +159,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -183,7 +184,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -206,6 +207,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_born_coul_long_cs.cu
+++ b/lib/gpu/lal_born_coul_long_cs.cu
@ -51,7 +51,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -75,7 +75,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -95,6 +95,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -192,7 +193,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -217,7 +218,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -240,6 +241,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@ -38,7 +38,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -63,7 +63,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -89,6 +89,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -174,7 +175,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -200,7 +201,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -229,6 +230,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_born_coul_wolf_cs.cu
+++ b/lib/gpu/lal_born_coul_wolf_cs.cu
@ -39,7 +39,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -64,7 +64,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -90,6 +90,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -176,7 +177,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -202,7 +203,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -231,6 +232,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@ -31,7 +31,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag,  const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -120,7 +121,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -141,7 +142,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -163,6 +164,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@ -36,7 +36,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -151,7 +152,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -177,7 +178,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -200,6 +201,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@ -36,7 +36,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -159,7 +160,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
                                    const __global numtyp *restrict sp_lj_in,
                                    const __global int *dev_nbor,
                                    const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum, const int nbor_pitch,
@ -185,7 +186,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -208,6 +209,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_charmm.cu
+++ b/lib/gpu/lal_charmm.cu
@ -34,7 +34,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag,
                       const int inum, const int nbor_pitch,
@ -53,7 +53,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_bio();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -73,6 +73,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -159,7 +160,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag,
                            const int inum, const int nbor_pitch,
@ -187,7 +188,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -209,6 +210,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@ -35,7 +35,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch,
@ -50,7 +50,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_bio();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -70,6 +70,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -156,7 +157,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict sp_lj_in,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
                                 const int inum, const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -203,6 +204,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_colloid.cu
+++ b/lib/gpu/lal_colloid.cu
@ -34,7 +34,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
                        const __global int *form,
                        const __global int *dev_nbor,
                        const __global int *dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@ -50,7 +50,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -69,6 +69,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -188,7 +189,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
                             const __global int *form_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
@ -215,7 +216,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -237,6 +238,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_coul.cu
+++ b/lib/gpu/lal_coul.cu
@ -35,7 +35,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_cl_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch,
@ -54,7 +54,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -74,6 +74,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,

    numtyp factor_coul;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_coul = sp_cl[sbmask(j)];
@ -125,7 +126,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_cl_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -146,7 +147,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
    cutsq[tid]=_cutsq[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -169,6 +170,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,

    numtyp factor_coul;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_coul = sp_cl[sbmask(j)];
--- a/lib/gpu/lal_coul_debye.cu
+++ b/lib/gpu/lal_coul_debye.cu
@ -35,7 +35,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_cl_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@ -55,7 +55,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -75,6 +75,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,

    numtyp factor_coul;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_coul = sp_cl[sbmask(j)];
@ -129,7 +130,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_cl_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@ -153,7 +154,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
    cutsq[tid]=_cutsq[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -176,6 +177,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,

    numtyp factor_coul;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_coul = sp_cl[sbmask(j)];
--- a/lib/gpu/lal_coul_dsf.cu
+++ b/lib/gpu/lal_coul_dsf.cu
@ -36,7 +36,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch,
@ -56,7 +56,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -81,6 +81,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul, r, prefactor, erfcc;
@ -138,7 +139,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@ -156,7 +157,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -183,6 +184,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul, r, prefactor, erfcc;
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@ -35,7 +35,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_cl_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -54,7 +54,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp e_coul, virial[6];
  if (EVFLAG) {
@ -73,6 +73,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
    numtyp qtmp; fetch(qtmp,i,q_tex);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul;
@ -132,7 +133,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_cl_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -152,7 +153,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
    scale[tid]=scale_in[tid];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp e_coul, virial[6];
  if (EVFLAG) {
@ -174,6 +175,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul;
--- a/lib/gpu/lal_coul_long_cs.cu
+++ b/lib/gpu/lal_coul_long_cs.cu
@ -49,7 +49,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_cl_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -68,7 +68,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp e_coul, virial[6];
  if (EVFLAG) {
@ -87,6 +87,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
    numtyp qtmp; fetch(qtmp,i,q_tex);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul;
@ -166,7 +167,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_cl_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -186,7 +187,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
    scale[tid]=scale_in[tid];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp e_coul, virial[6];
  if (EVFLAG) {
@ -208,6 +209,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul;
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -26,6 +26,22 @@
 #if defined(USE_OPENCL)
 #include "device_cl.h"

+const char *ocl_prefetch_test =
+"  #if (NBOR_PREFETCH == 1)                                                \n"\
+"  inline void ucl_prefetch(const __global int *p) { prefetch(p, 1); }     \n"\
+"  #else                                                                   \n"\
+"  enum LSC_LDCC {LSC_LDCC_DEFAULT, LSC_LDCC_L1UC_L3UC, LSC_LDCC_L1UC_L3C, \n"\
+"                 LSC_LDCC_L1C_L3UC, LSC_LDCC_L1C_L3C, LSC_LDCC_L1S_L3UC,  \n"\
+"                 LSC_LDCC_L1S_L3C, LSC_LDCC_L1IAR_L3C, };                 \n"\
+"  void __builtin_IB_lsc_prefetch_global_uint(const __global uint *, int,  \n"\
+"                                             enum LSC_LDCC);              \n"\
+"  inline void ucl_prefetch(const __global int *p) {                       \n"\
+"    __builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0,    \n"\
+"                                          LSC_LDCC_L1C_L3UC);             \n"\
+"  }                                                                       \n"\
+"  #endif                                                                  \n"\
+"  __kernel void ptest(__global int *i) { ucl_prefetch(i); i[0]++; }       \n";
+
 #ifdef LAL_OCL_EXTRA_ARGS
 #define LAL_DM_STRINGIFY(x) #x
 #define LAL_PRE_STRINGIFY(x) LAL_DM_STRINGIFY(x)
@ -370,7 +386,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)

  _ocl_config_name="CUSTOM";
  int token_count=0;
-  std::string params[18];
+  std::string params[19];
  char ocl_config[2048];
  strncpy(ocl_config,s_config.c_str(),2047);
  char *pch = strtok(ocl_config,",");
@ -378,7 +394,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
  pch = strtok(nullptr,",");
  if (pch == nullptr) return -11;
  while (pch != nullptr) {
-    if (token_count==18)
+    if (token_count==19)
      return -11;
    params[token_count]=pch;
    token_count++;
@ -389,11 +405,39 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
  #ifdef CL_VERSION_2_0
  _ocl_compile_string+="-cl-std=CL2.0 ";
  #endif
-  // workaround for double precision with Intel OpenCL
-  #ifdef _DOUBLE_DOUBLE
-  if (params[0] == "500") params[4] = "0";
+  if (params[0]=="500") {
+    _ocl_compile_string+="-DINTEL_OCL ";
+    #ifdef _DOUBLE_DOUBLE
+    // workaround for double precision with Intel OpenCL
+    params[4]="0";
+    #endif
+  }
+
+  // Test OCL JIT to make sure any prefetch options are supported
+  #ifdef LAL_DISABLE_PREFETCH
+  params[18]="0";
  #endif
-  if (params[4] != "0") _ocl_compile_string+="-cl-fast-relaxed-math ";
+  _nbor_prefetch=-1;
+  if (params[18]=="2") {
+    _nbor_prefetch=2;
+    UCL_Program ptest(*gpu);
+    std::string ptest_args=_ocl_compile_string+" -DNBOR_PREFETCH="+params[18];
+    int success=ptest.load_string(ocl_prefetch_test,ptest_args.c_str(),
+                                  nullptr,nullptr,1);
+    if (success!=UCL_SUCCESS) params[18]="1";
+  }
+  if (params[18]=="1") {
+    _nbor_prefetch=1;
+    UCL_Program ptest(*gpu);
+    std::string ptest_args=_ocl_compile_string+" -DNBOR_PREFETCH="+params[18];
+    int success=ptest.load_string(ocl_prefetch_test,ptest_args.c_str(),
+                                  nullptr,nullptr,1);
+    if (success!=UCL_SUCCESS) params[18]="0";
+  }
+  if (_nbor_prefetch<0) params[18]="0";
+  if (params[18]=="0") _nbor_prefetch=0;
+
+  if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
  _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
    std::string(OCL_PRECISION_COMPILE);
  if (gpu->has_subgroup_support())
@ -425,7 +469,8 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)

                         " -DMAX_SHARED_TYPES="+params[15]+
                         " -DMAX_BIO_SHARED_TYPES="+params[16]+
-                         " -DPPPM_MAX_SPLINE="+params[17];
+                         " -DPPPM_MAX_SPLINE="+params[17]+
+                         " -DNBOR_PREFETCH="+params[18];
  _ocl_compile_string += extra_args;
  #endif
  return 0;
@ -562,7 +607,11 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
    return -3;

  if (_user_cell_size<0.0) {
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    _neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
+    #else
    _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
+    #endif
  } else
    _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
  nbor->set_cutoff(cutoff);
@ -833,6 +882,10 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
      fprintf(screen,"Average split:   %.4f.\n",avg_split);
      fprintf(screen,"Lanes / atom:    %d.\n",threads_per_atom);
      fprintf(screen,"Vector width:    %d.\n", simd_size());
+      fprintf(screen,"Prefetch mode:   ");
+      if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n");
+      else if (_nbor_prefetch==1) fprintf(screen,"API.\n");
+      else fprintf(screen,"None.\n");
      fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
      if (nbor.gpu_nbor()==2)
        fprintf(screen,"CPU Neighbor:    %.4f s.\n",times[8]/_replica_size);
@ -958,7 +1011,7 @@ int DeviceT::compile_kernels() {
  k_info.set_function(*dev_program,"kernel_info");
  _compiled=true;

-  UCL_Vector<int,int> gpu_lib_data(19,*gpu,UCL_NOT_PINNED);
+  UCL_Vector<int,int> gpu_lib_data(20,*gpu,UCL_NOT_PINNED);
  k_info.set_size(1,1);
  k_info.run(&gpu_lib_data);
  gpu_lib_data.update_host(false);
--- a/lib/gpu/lal_device.cu
+++ b/lib/gpu/lal_device.cu
@ -52,4 +52,5 @@ __kernel void kernel_info(__global int *info) {
  info[16]=MAX_SHARED_TYPES;
  info[17]=MAX_BIO_SHARED_TYPES;
  info[18]=PPPM_MAX_SPLINE;
+  info[19]=NBOR_PREFETCH;
 }
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@ -346,6 +346,7 @@ class Device {
  int _block_pair, _block_bio_pair, _block_ellipse;
  int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id;
  int _max_shared_types, _max_bio_shared_types, _pppm_max_spline;
+  int _nbor_prefetch;

  UCL_Program *dev_program;
  UCL_Kernel k_zero, k_info;
--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@ -211,7 +211,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -235,7 +235,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -257,6 +257,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -282,8 +283,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
        numtyp rinv, r3inv, r5inv, r7inv;
        numtyp pre1, pre2, pre3, pre4;
        numtyp pdotp, pidotr, pjdotr;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -418,7 +419,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -445,7 +446,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -470,6 +471,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -494,8 +496,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
        numtyp rinv, r3inv, r5inv, r7inv;
        numtyp pre1, pre2, pre3, pre4;
        numtyp pdotp, pidotr, pjdotr;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
--- a/lib/gpu/lal_dipole_lj_sf.cu
+++ b/lib/gpu/lal_dipole_lj_sf.cu
@ -212,7 +212,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -236,7 +236,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -258,6 +258,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -286,8 +287,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
        numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
        numtyp4 aforcecoul, bforcecoul;

-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -450,7 +451,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -478,7 +479,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -503,6 +504,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -530,8 +532,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
        numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
        numtyp4 aforcecoul, bforcecoul;

-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
--- a/lib/gpu/lal_dipole_long_lj.cu
+++ b/lib/gpu/lal_dipole_long_lj.cu
@ -213,7 +213,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -238,7 +238,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -264,6 +264,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -291,8 +292,8 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
        numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
        numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
        numtyp fdx,fdy,fdz,fax,fay,faz;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -462,7 +463,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -490,7 +491,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -519,6 +520,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -545,8 +547,8 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
        numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
        numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
        numtyp fdx,fdy,fdz,fax,fay,faz;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
--- a/lib/gpu/lal_dpd.cu
+++ b/lib/gpu/lal_dpd.cu
@ -168,7 +168,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
                    const __global numtyp *restrict sp_sqrt,
                    const __global int * dev_nbor,
                    const __global int * dev_packed,
-                    __global acctyp4 *restrict ans,
+                    __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch,
@ -183,7 +183,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -203,6 +203,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,

    numtyp factor_dpd, factor_sqrt;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_dpd = sp_lj[sbmask(j)];
@ -284,7 +285,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_sqrt_in,
                         const __global int * dev_nbor,
                         const __global int * dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch,
@ -318,7 +319,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -343,6 +344,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
    numtyp factor_dpd, factor_sqrt;
    #endif
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      #ifndef ONETYPE
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@ -246,6 +246,7 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
    tfrho=type2frho[itype];

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      j &= NEIGHMASK;

@ -332,6 +333,7 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
    #endif

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      j &= NEIGHMASK;

@ -376,7 +378,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
                    const __global numtyp *cutsq,
                    const __global int *dev_nbor,
                    const __global int *dev_packed,
-                    __global acctyp4 *ans,
+                    __global acctyp3 *ans,
                    __global acctyp *engv,
                    const int eflag, const int vflag,  const int inum,
                    const int nbor_pitch, const int ntypes,
@ -388,7 +390,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_answers_eam();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -407,6 +409,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      j &= NEIGHMASK;

@ -487,7 +490,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
                         const __global numtyp *cutsq,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *ans,
+                         __global acctyp3 *ans,
                         __global acctyp *engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const numtyp cutforcesq,
@ -510,7 +513,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
  int n_stride;
  local_allocate_store_answers_eam();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -532,6 +535,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
    #endif

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      j &= NEIGHMASK;

--- a/lib/gpu/lal_ellipsoid_extra.h
+++ b/lib/gpu/lal_ellipsoid_extra.h
@ -152,7 +152,7 @@ _texture_2d( quat_tex,int4);
        engv+=inum;                                                         \
      }                                                                     \
    }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -224,7 +224,7 @@ _texture_2d( quat_tex,int4);
        engv+=inum;                                                         \
      }                                                                     \
    }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@ -30,7 +30,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@ -40,7 +40,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -59,6 +59,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -109,7 +110,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@ -127,7 +128,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
    gauss1[tid]=gauss1_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -149,6 +150,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@ -80,6 +80,9 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
                    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }

+#ifdef INTEL_OCL
+__attribute__((intel_reqd_sub_group_size(16)))
+#endif
 __kernel void k_gayberne(const __global numtyp4 *restrict x_,
                         const __global numtyp4 *restrict q,
                         const __global numtyp4 *restrict shape,
@ -90,7 +93,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict lshape,
                         const __global int *dev_nbor,
                         const int stride,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         const int astride,
                         __global acctyp *restrict engv,
                         __global int *restrict err_flag,
@ -108,7 +111,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, virial[6];
@ -138,6 +141,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
--- a/lib/gpu/lal_gayberne_ext.cpp
+++ b/lib/gpu/lal_gayberne_ext.cpp
@ -108,28 +108,33 @@ int** compute(const int ago, const int inum_full, const int nall,
                tagint **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-                double **host_quat);
+                const int *ellipsoid, const EllipsoidBonus *bonus);

 int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, double *sublo,
-                       double *subhi, tagint *tag, int **nspecial, tagint **special,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start, int **ilist,
-                       int **jnum, const double cpu_time, bool &success,
-                       double **host_quat) {
+                       double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum, const double cpu_time,
+                       bool &success, const int *ellipsoid,
+                       const void *bonus) {
  return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
                      tag, nspecial, special, eflag, vflag, eatom, vatom,
-                      host_start, ilist, jnum, cpu_time, success, host_quat);
+                      host_start, ilist, jnum, cpu_time, success,
+                      ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }

 int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
-                     const double cpu_time, bool &success, double **host_quat) {
+                     const double cpu_time, bool &success,
+                     const int *ellipsoid, const void *bonus) {
  return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
                      numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
-                      cpu_time, success, host_quat);
+                      cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }

 // ---------------------------------------------------------------------------
--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@ -34,7 +34,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
                                          const __global numtyp *restrict lshape,
                                          const __global int *dev_nbor,
                                          const int stride,
-                                          __global acctyp4 *restrict ans,
+                                          __global acctyp3 *restrict ans,
                                          __global acctyp *restrict engv,
                                          __global int *restrict err_flag,
                                          const int eflag, const int vflag,
@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -75,6 +75,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);

      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -259,7 +260,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict gum,
                            const int stride,
                            const __global int *dev_ij,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            __global int *restrict err_flag,
                            const int eflag, const int vflag, const int start,
@ -277,7 +278,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -296,6 +297,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -347,7 +349,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict gum,
                                 const int stride,
                                 const __global int *dev_ij,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 __global int *restrict err_flag,
                                 const int eflag, const int vflag,
@ -371,7 +373,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -393,6 +395,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@ -113,7 +113,7 @@ _texture( q_tex,int2);
    dufld[5]=red_acc[5][tid];                                               \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -147,7 +147,7 @@ _texture( q_tex,int2);
    _fieldp[5]=red_acc[5][tid];                                             \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -174,7 +174,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -254,7 +254,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -277,7 +277,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -302,7 +302,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -391,7 +391,7 @@ _texture( q_tex,int2);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -416,9 +416,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
-                                __global acctyp4 *restrict tep,
+                                __global acctyp3 *restrict tep,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                const int t_per_atom, const numtyp aewald,
@ -432,7 +432,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -441,7 +441,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

-  acctyp4 tq;
+  acctyp3 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;

  const __global numtyp4* polar1 = &extra[0];
@ -634,7 +634,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
      frcx = sizik * frcx;
      frcy = sizik * frcy;
      frcz = sizik * frcz;
-      
+
      // compute the torque components for this interaction

      numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) -
@ -717,7 +717,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
@ -730,7 +730,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -895,9 +895,9 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
-                                __global acctyp4 *restrict tep,
+                                __global acctyp3 *restrict tep,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                const int t_per_atom, const numtyp aewald,
@ -910,7 +910,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -919,7 +919,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

-  acctyp4 tq;
+  acctyp3 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;

  const __global numtyp4* polar1 = &extra[0];
@ -1210,7 +1210,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict fieldp,
+                                __global acctyp3 *restrict fieldp,
                                const int inum,  const int nall,
                                const int nbor_pitch, const int t_per_atom,
                                const numtyp aewald, const numtyp off2,
@ -1390,7 +1390,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict fieldp,
+                                __global acctyp3 *restrict fieldp,
                                const int inum,  const int nall,
                                const int nbor_pitch, const int t_per_atom,
                                const numtyp aewald, const numtyp off2,
@ -1541,9 +1541,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
                            const __global int *dev_short_nbor,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
-                            __global acctyp4 *restrict tep,
+                            __global acctyp3 *restrict tep,
                            const int eflag, const int vflag, const int inum,
                            const int nall, const int nbor_pitch, const int t_per_atom,
                            const numtyp aewald, const numtyp felec,
@ -1556,7 +1556,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -1697,7 +1697,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;

      // calculate the real space Ewald error function terms
-      
+
      int m;
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
@ -2003,12 +2003,12 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
  if (ii<inum) {

    const int nlpts = (bsorder-1) / 2;
-    
+
    int istart = fast_mul(ii,4);
    const int igridx = igrid[istart];
    const int igridy = igrid[istart+1];
    const int igridz = igrid[istart+2];
-    
+
    // now istart is used to index thetai1, thetai2 and thetai3
    istart = fast_mul(ii,bsorder);

@ -2202,7 +2202,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
    fdip_buf[7] = tuv110_1;
    fdip_buf[8] = tuv101_1;
    fdip_buf[9] = tuv011_1;
-    idx = ii;    
+    idx = ii;
    for (int m = 0; m < 10; m++) {
      fdip_phi1[idx] = fdip_buf[m];
      idx += inum;
@ -2218,7 +2218,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
    fdip_buf[7] = tuv110_2;
    fdip_buf[8] = tuv101_2;
    fdip_buf[9] = tuv011_2;
-    idx = ii;    
+    idx = ii;
    for (int m = 0; m < 10; m++) {
      fdip_phi2[idx] = fdip_buf[m];
      idx += inum;
@ -2244,7 +2244,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
    fdip_buf[17] = tuv102;
    fdip_buf[18] = tuv012;
    fdip_buf[19] = tuv111;
-    idx = ii;    
+    idx = ii;
    for (int m = 0; m < 20; m++) {
      fdip_sum_phi[idx] = fdip_buf[m];
      idx += inum;
@ -2275,12 +2275,12 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1,
  if (ii<inum) {

    int nlpts = (bsorder-1) / 2;
-    
+
    int istart = fast_mul(ii,4);
    int igridx = igrid[istart];
    int igridy = igrid[istart+1];
    int igridz = igrid[istart+2];
-    
+
    // now istart is used to index thetai1, thetai2 and thetai3
    istart = fast_mul(ii,bsorder);

@ -2410,7 +2410,7 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1,
    buf[18] = tuv012;
    buf[19] = tuv111;

-    int idx = ii;    
+    int idx = ii;
    for (int m = 0; m < 20; m++) {
      fphi[idx] = buf[m];
      idx += inum;
--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@ -31,7 +31,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
                   const __global numtyp *restrict sp_lj,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom) {
@ -41,7 +41,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -59,6 +59,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -110,7 +111,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int * dev_nbor,
                        const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@ -144,7 +145,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -166,6 +167,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,

    NOUNROLL
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      #ifndef ONETYPE
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@ -31,7 +31,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -118,7 +119,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -139,7 +140,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -161,6 +162,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@ -36,7 +36,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag,  const int vflag,
                               const int inum, const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -156,7 +157,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
                                    const __global numtyp *restrict sp_lj_in,
                                    const __global int *dev_nbor,
                                    const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum, const int nbor_pitch,
@ -182,7 +183,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -205,6 +206,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@ -36,7 +36,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int *dev_nbor,
                        const __global int *dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -147,7 +148,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -173,7 +174,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -196,6 +197,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@ -36,7 +36,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -154,7 +155,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -204,6 +205,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@ -36,7 +36,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -154,7 +155,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -178,7 +179,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -201,6 +202,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_coul_msm.cu
+++ b/lib/gpu/lal_lj_coul_msm.cu
@ -94,7 +94,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -117,7 +117,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -139,6 +139,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
    numtyp cut_coul = ucl_sqrt(cut_coulsq);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -215,7 +216,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict sp_lj_in,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
                                 const int inum,  const int nbor_pitch,
@ -239,7 +240,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -264,6 +265,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
    numtyp cut_coul = ucl_sqrt(cut_coulsq);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_cubic.cu
+++ b/lib/gpu/lal_lj_cubic.cu
@ -39,7 +39,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj,
                         const __global int * dev_nbor,
                         const __global int * dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -132,7 +133,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch, const int t_per_atom) {
@ -155,7 +156,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -176,6 +177,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@ -38,7 +38,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj_in,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch,
@ -62,7 +62,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -88,6 +88,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
@ -165,7 +166,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch,
@ -190,7 +191,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -219,6 +220,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@ -33,7 +33,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -122,7 +123,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom) {
@ -143,7 +144,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -165,6 +166,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_expand_coul_long.cu
+++ b/lib/gpu/lal_lj_expand_coul_long.cu
@ -36,7 +36,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -158,7 +159,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
    lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -204,6 +205,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_gromacs.cu
+++ b/lib/gpu/lal_lj_gromacs.cu
@ -34,7 +34,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@ -50,7 +50,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -134,7 +135,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch, const int t_per_atom) {
@ -156,7 +157,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
    ljsw[tid]=ljsw_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -177,6 +178,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_smooth.cu
+++ b/lib/gpu/lal_lj_smooth.cu
@ -33,7 +33,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
                   const __global numtyp *restrict sp_lj,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom) {
@ -43,7 +43,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -61,8 +61,9 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,

    numtyp force, r6inv, factor_lj, forcelj;
    numtyp r, t, tsq, fskin;
-    
+
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -76,10 +77,10 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;
-      
+
      int mtype=itype*lj_types+jtype;
      if (rsq<lj1[mtype].z) {
-        
+
        numtyp r2inv=ucl_recip(rsq);
        if (rsq < lj1[mtype].w) {
          r6inv = r2inv*r2inv*r2inv;
@ -135,7 +136,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int * dev_nbor,
                        const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@ -169,7 +170,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -194,6 +195,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,

    NOUNROLL
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      #ifndef ONETYPE
      factor_lj = sp_lj[sbmask(j)];
@ -236,7 +238,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
          if (rsq < lj1[mtype].w)
            e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
          else
-            e = ljsw0[mtype].x - ljsw[mtype].x*t - 
+            e = ljsw0[mtype].x - ljsw[mtype].x*t -
              ljsw[mtype].y*tsq/2.0 - ljsw[mtype].z*tsq*t/3.0 -
              ljsw[mtype].w*tsq*tsq/4.0 - lj3[mtype].z; //???

--- a/lib/gpu/lal_lj_spica.cu
+++ b/lib/gpu/lal_lj_spica.cu
@ -31,7 +31,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj_in,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -128,7 +129,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, const int t_per_atom) {
@ -149,7 +150,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -171,6 +172,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_spica_long.cu
+++ b/lib/gpu/lal_lj_spica_long.cu
@ -36,7 +36,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag,  const int vflag, const int inum,
                            const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -166,7 +167,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict sp_lj_in,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
                                 const int inum, const int nbor_pitch,
@ -189,7 +190,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
    lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -212,6 +213,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_tip4p_long.cu
+++ b/lib/gpu/lal_lj_tip4p_long.cu
@ -59,7 +59,7 @@ _texture( q_tex,int2);

 /* ----------------------------------------------------------------------
   GPU analogue of Atom::map inline method,
-   but now limited to map_array mapping style. 
+   but now limited to map_array mapping style.
   Map global ID to local index of atom.
 ---------------------------------------------------------------------- */
 ucl_inline int atom_mapping(const __global int *map, tagint glob) {
@ -134,16 +134,16 @@ ucl_inline void compute_newsite(int iO, int  iH1, int  iH2,

 /* ----------------------------------------------------------------------
   Compute resulting forces (ans), energies and virial (engv).
-   An additional term is calculated based on the previously 
-   calculated values on the virlual sites (ansO), 
-   which should be distributed over the real atoms. 
+   An additional term is calculated based on the previously
+   calculated values on the virlual sites (ansO),
+   which should be distributed over the real atoms.
   For some hydrogens, the corresponding oxygens are
   not local atoms and the ansO value is not calculated.
   The required increase is calculated directly in the main function.
 ---------------------------------------------------------------------- */
 __kernel void k_lj_tip4p_long_distrib(
    const __global numtyp4 *restrict x_,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
    __global acctyp *restrict engv,
    const int eflag, const int vflag, const int inum,
    const int nbor_pitch, const int t_per_atom,
@ -151,11 +151,11 @@ __kernel void k_lj_tip4p_long_distrib(
    const __global numtyp4 *restrict m,
    const int typeO, const int typeH,
    const numtyp alpha,
-    const __global numtyp *restrict q_, 
+    const __global numtyp *restrict q_,
    const __global acctyp4 *restrict ansO) {

  int i = BLOCK_ID_X*(BLOCK_SIZE_X)+THREAD_ID_X;
-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;

  if (i<inum) {
@ -208,7 +208,7 @@ __kernel void k_lj_tip4p_long_distrib(
        engv[inum*engv_iter + i] += vM.z * (acctyp)(1 - alpha);
      }
    }
-    acctyp4 old=ans[i];
+    acctyp3 old=ans[i];
    old.x+=f.x;
    old.y+=f.y;
    old.z+=f.z;
@ -219,7 +219,7 @@ __kernel void k_lj_tip4p_long_distrib(
 /* ----------------------------------------------------------------------
   Rebuild hneigh after the neighbor build.
   hneight stores local IDs of H1 and H2 for each local and ghost O
-   and local ID of O for each local H. 
+   and local ID of O for each local H.
 ---------------------------------------------------------------------- */
 __kernel void k_lj_tip4p_reneigh(
    const __global numtyp4 *restrict x_,
@ -230,7 +230,7 @@ __kernel void k_lj_tip4p_reneigh(
    __global int *restrict hneigh,
    __global numtyp4 *restrict m,
    const int typeO, const int typeH,
-    const __global tagint *restrict tag, 
+    const __global tagint *restrict tag,
    const __global int *restrict map,
    const __global int *restrict sametag) {

@ -298,7 +298,7 @@ __kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
      iO  = i;
      numtyp qO; fetch(qO,iO,q_tex);
      if (iH1>=0 && iH2>=0) {
-      	compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_);
+        compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_);
      } else {
        m[iO] = ix;
        m[iO].w = qO;
@ -313,9 +313,9 @@ __kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
 /* ----------------------------------------------------------------------
   Compute initial value of force, energy and virial for each local particle.
   The values calculated on oxygens use the virtual charge position (m) and
-   they are stored in a separate  array (ansO) for further distribution 
+   they are stored in a separate  array (ansO) for further distribution
   in a separate kernel. For some hydrogens located on the boundary
-   of the local region, oxygens are non-local and the contribution 
+   of the local region, oxygens are non-local and the contribution
   of oxygen is calculated separately in this kernel for them .
 ---------------------------------------------------------------------- */
 __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
@ -325,7 +325,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
    const __global numtyp *restrict sp_lj,
    const __global int * dev_nbor,
    const __global int * dev_packed,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
    __global acctyp *restrict engv,
    const int eflag, const int vflag, const int inum,
    const int nbor_pitch, const int t_per_atom,
@ -344,7 +344,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f, fO;
+  acctyp3 f;
+  acctyp4 fO;
  f.x=(acctyp)0;  f.y=(acctyp)0;  f.z=(acctyp)0;
  fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
  acctyp energy, e_coul, virial[6], vO[6];
@ -386,6 +387,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj,factor_coul;
@ -470,7 +472,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
            e_coul += prefactor*(_erfc-factor_coul);
          }
          if (EVFLAG && vflag) {
-            acctyp4 fd;
+            acctyp3 fd;
            fd.x = delx*force_coul;
            fd.y = dely*force_coul;
            fd.z = delz*force_coul;
@ -645,7 +647,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
    const __global numtyp *restrict sp_lj_in,
    const __global int * dev_nbor,
    const __global int * dev_packed,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
    __global acctyp *restrict engv,
    const int eflag, const int vflag, const int inum,
    const int nbor_pitch, const int t_per_atom,
@ -674,7 +676,8 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
    if (EVFLAG && eflag)
      lj3[tid]=lj3_in[tid];
  }
-  acctyp4 f, fO;
+  acctyp3 f;
+  acctyp4 fO;
  f.x=(acctyp)0;  f.y=(acctyp)0;  f.z=(acctyp)0;
  fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
  acctyp energy, e_coul, virial[6], vO[6];
@ -717,6 +720,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj,factor_coul;
@ -801,7 +805,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
            e_coul += prefactor*(_erfc-factor_coul);
          }
          if (EVFLAG && vflag) {
-            acctyp4 fd;
+            acctyp3 fd;
            fd.x = delx*force_coul;
            fd.y = dely*force_coul;
            fd.z = delz*force_coul;
--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@ -31,7 +31,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
                    const __global numtyp *restrict sp_lj_in,
                    const __global int *dev_nbor,
                    const __global int *dev_packed,
-                    __global acctyp4 *restrict ans,
+                    __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -119,7 +120,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@ -139,7 +140,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
    mie3[tid]=mie3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -161,6 +162,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@ -33,7 +33,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -120,7 +121,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@ -141,7 +142,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
      mor2[tid]=mor2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -163,6 +164,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -52,7 +52,7 @@ _texture_2d( pos_tex,int4);
  compute the id of the cell where the atoms belong to
 x: atom coordinates
 cell_id: cell ids
-particle_id: 
+particle_id:
 boxlo[0-2]: the lower left corner of the local box
 ncell[xyz]: the number of cells in xyz dims
 i_cell_size is the inverse cell size
@ -489,6 +489,10 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,

 #endif

+#define SPECIAL_DATA_PRELOAD_SIZE 3
+#define UNROLL_FACTOR_LIST 4
+#define UNROLL_FACTOR_SPECIAL 2
+
 __kernel void kernel_special(__global int *dev_nbor,
                             __global int *host_nbor_list,
                             const __global int *host_numj,
@ -503,46 +507,95 @@ __kernel void kernel_special(__global int *dev_nbor,

  if (ii<nt) {
    int stride;
-    __global int *list, *list_end;
+    __global int *list;

    int n1=nspecial[ii*3];
    int n2=nspecial[ii*3+1];
    int n3=nspecial[ii*3+2];

-    int numj;
+    int myj;
    if (ii < inum) {
      stride=inum;
      list=dev_nbor+stride+ii;
-      numj=*list;
+      int numj=*list;
      list+=stride+fast_mul(ii,t_per_atom-1);
      stride=fast_mul(inum,t_per_atom);
-      int njt=numj/t_per_atom;
-      list_end=list+fast_mul(njt,stride)+(numj & (t_per_atom-1));
+      myj=numj/t_per_atom;
+      if (offset < (numj & (t_per_atom-1)))
+        myj++;
      list+=offset;
    } else {
      stride=1;
      list=host_nbor_list+(ii-inum)*max_nbors;
-      numj=host_numj[ii-inum];
-      list_end=list+fast_mul(numj,stride);
+      myj=host_numj[ii-inum];
    }

-    for ( ; list<list_end; list+=stride) {
-      int nbor=*list;
-      tagint jtag=tag[nbor];
+#if SPECIAL_DATA_PRELOAD_SIZE > 0
+    tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE];
+    for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) {
+      special_preload[j] = special[ii + i*nt];
+    }
+#endif

-      int offset=ii;
-      for (int i=0; i<n3; i++) {
-        if (special[offset]==jtag) {
-          int which = 1;
-          if (i>=n1)
-            which++;
-          if (i>=n2)
-            which++;
-          nbor=nbor ^ (which << SBBITS);
-          *list=nbor;
-        }
-        offset+=nt;
+    for (int m=0; m<myj; m+=UNROLL_FACTOR_LIST) {
+      int nbor[UNROLL_FACTOR_LIST];
+      tagint jtag[UNROLL_FACTOR_LIST];
+      __global int* list_addr[UNROLL_FACTOR_LIST];
+      int lmax = myj - m;
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        list_addr[l] = list + l*stride;
+        if (l < lmax)
+          nbor[l] = *list_addr[l];
      }
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        if (l < lmax)
+          jtag[l] = tag[nbor[l]];
+      }
+
+      for (int i=0, j=0; i<n3; i+=UNROLL_FACTOR_SPECIAL, j++) {
+        tagint special_data[UNROLL_FACTOR_SPECIAL];
+        int which[UNROLL_FACTOR_SPECIAL];
+
+        for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
+          which[c] = 1;
+          if (i + c < n3)
+          {
+#if SPECIAL_DATA_PRELOAD_SIZE > 0
+            if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) {
+              special_data[c] = special_preload[j];
+            }
+            else
+#endif
+              special_data[c] = special[ii + (i+c)*nt];
+          }
+        }
+
+        for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
+          if (i+k >= n1) {
+            which[k]++;
+          }
+        }
+        for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
+          if (i+k >= n2) {
+            which[k]++;
+          }
+          which[k] <<= SBBITS;
+        }
+        for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
+          if (i + c < n3) {
+            for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+              if (l < lmax && special_data[c] == jtag[l]) {
+                nbor[l]=nbor[l] ^ which[c];
+              }
+            }
+          }
+        }
+      }
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        if (l < lmax)
+          *list_addr[l] = nbor[l];
+      }
+      list+=UNROLL_FACTOR_LIST * stride;
    }
  } // if ii
 }
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@ -217,7 +217,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
                     const grdtyp delxinv,  const grdtyp delyinv,
                     const grdtyp delzinv, const int order,
                     const int order2, const grdtyp qqrd2e_scale,
-                     __global acctyp4 *restrict ans) {
+                     __global acctyp3 *restrict ans) {
  __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
  __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
  __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
@ -239,7 +239,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
    fetch(qs,ii,q_tex);
    qs*=qqrd2e_scale;

-    acctyp4 ek;
+    acctyp3 ek;
    ek.x=(acctyp)0.0;
    ek.y=(acctyp)0.0;
    ek.z=(acctyp)0.0;
--- a/lib/gpu/lal_pre_cuda_hip.h
+++ b/lib/gpu/lal_pre_cuda_hip.h
@ -57,6 +57,7 @@
 #define MAX_SHARED_TYPES 11
 #define MAX_BIO_SHARED_TYPES 128
 #define PPPM_MAX_SPLINE 8
+#define NBOR_PREFETCH 0

 // -------------------------------------------------------------------------
 //                              KERNEL MACROS
--- a/lib/gpu/lal_pre_ocl_config.h
+++ b/lib/gpu/lal_pre_ocl_config.h
@ -23,7 +23,7 @@
 //   THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
 //   BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
 //   BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
-//   PPPM_MAX_SPLINE}
+//   PPPM_MAX_SPLINE, NBOR_PREFETCH}
 //
 //*************************************************************************/

@ -39,15 +39,15 @@ const char * ocl_config_names[] =
  };
 const char * ocl_config_strings[] =
  {
-   "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8",
-   "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8",
-   "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8",
+   "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8,0",
+   "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
+   "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
 #ifdef _SINGLE_SINGLE
-   "INTEL_GPU,500,8,16,1,1,4,8,1,64,64,64,64,64,8,128,8,128,8",
-   "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8",
+   "INTEL_GPU,500,8,32,1,1,4,8,2,128,128,128,128,64,8,128,8,128,8,2",
+   "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8,0",
 #else
-   "INTEL_GPU,500,8,16,1,1,2,8,1,64,64,64,64,64,8,128,8,128,8",
-   "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8",
+   "INTEL_GPU,500,8,32,1,1,2,8,2,128,128,128,128,64,8,128,8,128,8,2",
+   "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8,0",
 #endif
-   "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8"
+   "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8,0"
  };
--- a/lib/gpu/lal_precision.h
+++ b/lib/gpu/lal_precision.h
@ -57,6 +57,10 @@ struct _lgpu_float2 {
  float x; float y;
 };

+struct _lgpu_float3 {
+  float x; float y; float z;
+};
+
 struct _lgpu_float4 {
  float x; float y; float z; float w;
 };
@ -65,6 +69,10 @@ struct _lgpu_double2 {
  double x; double y;
 };

+struct _lgpu_double3 {
+  double x; double y; double z;
+};
+
 struct _lgpu_double4 {
  double x; double y; double z; double w;
 };
@ -75,6 +83,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) {
  return out;
 }

+inline std::ostream & operator<<(std::ostream &out, const _lgpu_float3 &v) {
+  out << v.x << " " << v.y << " " << v.z;
+  return out;
+}
+
 inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) {
  out << v.x << " " << v.y << " " << v.z;
  return out;
@ -85,6 +98,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) {
  return out;
 }

+inline std::ostream & operator<<(std::ostream &out, const _lgpu_double3 &v) {
+  out << v.x << " " << v.y << " " << v.z;
+  return out;
+}
+
 inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
  out << v.x << " " << v.y << " " << v.z;
  return out;
@ -97,8 +115,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION float
 #define ACC_PRECISION double
 #define numtyp2 _lgpu_float2
+#define numtyp3 _lgpu_float3
 #define numtyp4 _lgpu_float4
 #define acctyp2 _lgpu_double2
+#define acctyp3 _lgpu_double3
 #define acctyp4 _lgpu_double4
 #endif

@ -107,8 +127,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION double
 #define ACC_PRECISION double
 #define numtyp2 _lgpu_double2
+#define numtyp3 _lgpu_double3
 #define numtyp4 _lgpu_double4
 #define acctyp2 _lgpu_double2
+#define acctyp3 _lgpu_double3
 #define acctyp4 _lgpu_double4
 #endif

@ -117,8 +139,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION float
 #define ACC_PRECISION float
 #define numtyp2 _lgpu_float2
+#define numtyp3 _lgpu_float3
 #define numtyp4 _lgpu_float4
 #define acctyp2 _lgpu_float2
+#define acctyp3 _lgpu_float3
 #define acctyp4 _lgpu_float4
 #endif

--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@ -93,6 +93,13 @@
 //     Definition:   Maximum order for splines in PPPM
 //     Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
 //
+//  NBOR_PREFETCH
+//     Definition:   Control use of prefetch for neighbor indices
+//                   0 = No prefetch
+//                   1 = Prefetch using standard API
+//                   2 = Prefetch using Intel intrinsics
+//     Restrictions: NBOR_PREFETCH forced to 0 when LAL_DISABLE_PREFETCH
+//                   is defined in library build
 //*************************************************************************/

 // -------------------------------------------------------------------------
@ -101,6 +108,7 @@

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_pre_cuda_hip.h"
+#define ucl_prefetch(p)
 #define ucl_pow pow
 #endif

@ -285,6 +293,55 @@
  #define simd_size() SIMD_SIZE
 #endif

+// -------------------------------------------------------------------------
+//                      OPENCL KERNEL MACROS - PREFETCH
+// -------------------------------------------------------------------------
+
+#if (NBOR_PREFETCH == 0)
+#define ucl_prefetch(p)
+#endif
+
+#if (NBOR_PREFETCH == 1)
+inline void ucl_prefetch(const __global int *p) {
+  prefetch(p, 1);
+}
+#endif
+
+#if (NBOR_PREFETCH == 2)
+// Load message caching control
+enum LSC_LDCC {
+  LSC_LDCC_DEFAULT,
+  LSC_LDCC_L1UC_L3UC,   //1 Override to L1 uncached and L3 uncached
+  LSC_LDCC_L1UC_L3C,    //1 Override to L1 uncached and L3 cached
+  LSC_LDCC_L1C_L3UC,    //1 Override to L1 cached and L3 uncached
+  LSC_LDCC_L1C_L3C,     //1 Override to L1 cached and L3 cached
+  LSC_LDCC_L1S_L3UC,    //1 Override to L1 streaming load and L3 uncached
+  LSC_LDCC_L1S_L3C,     //1 Override to L1 streaming load and L3 cached
+  LSC_LDCC_L1IAR_L3C,   //1 Override to L1 invalidate-after-read, and L3 cached
+};
+
+void __builtin_IB_lsc_prefetch_global_uint(const __global uint *base,
+                                           int elemOff,
+                                           enum LSC_LDCC cacheOpt); //D32V1
+
+inline void ucl_prefetch(const __global int *p) {
+  __builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0,
+                                        LSC_LDCC_L1C_L3UC);
+}
+#endif
+
+struct _lgpu_float3 {
+  float x; float y; float z;
+};
+struct _lgpu_double3 {
+  double x; double y; double z;
+};
+#ifdef _SINGLE_SINGLE
+#define acctyp3 struct _lgpu_float3
+#else
+#define acctyp3 struct _lgpu_double3
+#endif
+
 // -------------------------------------------------------------------------
 //                            END OPENCL DEFINITIONS
 // -------------------------------------------------------------------------
@ -301,6 +358,9 @@
 #define numtyp4 double4
 #define acctyp double
 #define acctyp2 double2
+#ifndef acctyp3
+#define acctyp3 double3
+#endif
 #define acctyp4 double4
 #endif

@ -310,6 +370,9 @@
 #define numtyp4 float4
 #define acctyp double
 #define acctyp2 double2
+#ifndef acctyp3
+#define acctyp3 double3
+#endif
 #define acctyp4 double4
 #endif

@ -319,6 +382,9 @@
 #define numtyp4 float4
 #define acctyp float
 #define acctyp2 float2
+#ifndef acctyp3
+#define acctyp3 float3
+#endif
 #define acctyp4 float4
 #endif

--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@ -32,6 +32,9 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
  return ans;
 }

+#ifdef INTEL_OCL
+__attribute__((intel_reqd_sub_group_size(16)))
+#endif
 __kernel void k_resquared(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict q,
                          const __global numtyp4 *restrict shape,
@ -41,7 +44,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
                          const int ntypes,
                          const __global int *dev_nbor,
                          const int stride,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          const int astride,
                          __global acctyp *restrict engv,
                          __global int *restrict err_flag,
@ -62,7 +65,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
  const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0;
  const numtyp cr60=ucl_cbrt((numtyp)60.0);

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, virial[6];
@ -122,6 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
--- a/lib/gpu/lal_re_squared_ext.cpp
+++ b/lib/gpu/lal_re_squared_ext.cpp
@ -105,28 +105,32 @@ void re_gpu_clear() {
                tagint **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-                double **host_quat);
+                const int *ellipsoid, const EllipsoidBonus *bonus);

 int** re_gpu_compute_n(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, double *sublo,
-                       double *subhi, tagint *tag, int **nspecial, tagint **special,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start, int **ilist,
-                       int **jnum, const double cpu_time, bool &success,
-                       double **host_quat) {
+                       double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum, const double cpu_time,
+                       bool &success, const int *ellipsoid,
+                       const void *bonus) {
  return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
                      tag, nspecial, special, eflag, vflag, eatom, vatom,
-                      host_start, ilist, jnum, cpu_time, success, host_quat);
+                      host_start, ilist, jnum, cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }

 int * re_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
-                     const double cpu_time, bool &success, double **host_quat) {
+                     const double cpu_time, bool &success,
+                     const int *ellipsoid, const void *bonus) {
  return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
                      numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
-                      cpu_time, success, host_quat);
+                      cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }

 // ---------------------------------------------------------------------------
@ -135,4 +139,3 @@ int * re_gpu_compute(const int ago, const int inum_full, const int nall,
 double re_gpu_bytes() {
  return REMF.host_memory_usage();
 }
-
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@ -86,7 +86,7 @@
        ap1+=astride;                                                        \
      }                                                                      \
    }                                                                        \
-    acctyp4 old=ans[ii];                                                     \
+    acctyp3 old=ans[ii];                                                     \
    old.x+=f.x;                                                              \
    old.y+=f.y;                                                              \
    old.z+=f.z;                                                              \
@ -131,7 +131,7 @@
        ap1+=astride;                                                       \
      }                                                                     \
    }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -154,7 +154,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
                                           const int ntypes,
                                           const __global int *dev_nbor,
                                           const int stride,
-                                           __global acctyp4 *restrict ans,
+                                           __global acctyp3 *restrict ans,
                                           const int astride,
                                           __global acctyp *restrict engv,
                                           __global int *restrict err_flag,
@ -180,7 +180,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
  const numtyp solv_f_r =
     (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, virial[6];
@ -216,6 +216,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
@ -409,7 +410,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
                                           const int ntypes,
                                           const __global int *dev_nbor,
                                           const int stride,
-                                           __global acctyp4 *restrict ans,
+                                           __global acctyp3 *restrict ans,
                                           __global acctyp *restrict engv,
                                           __global int *restrict err_flag,
                                           const int eflag, const int vflag,
@ -435,7 +436,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
  const numtyp solv_f_r =
    (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -454,6 +455,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int i=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(i)];
      i &= NEIGHMASK;
@ -610,7 +612,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict gum,
                             const int stride,
                             const __global int *dev_ij,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             __global int *restrict err_flag,
                             const int eflag, const int vflag, const int start,
@ -628,7 +630,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[2];
  sp_lj[3]=gum[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -647,6 +649,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -697,7 +700,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict gum,
                                  const int stride,
                                  const __global int *dev_ij,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  __global int *restrict err_flag,
                                  const int eflag, const int vflag,
@ -721,7 +724,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -743,6 +746,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_soft.cu
+++ b/lib/gpu/lal_soft.cu
@ -32,7 +32,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -48,7 +48,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -119,7 +120,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -137,7 +138,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
    coeff[tid]=coeff_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -159,6 +160,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@ -57,7 +57,7 @@ _texture( sw3_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -116,7 +116,7 @@ _texture( sw3_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -194,7 +194,7 @@ _texture( sw3_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -265,7 +265,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
                   const __global numtyp4 * restrict c_14,
                   const __global numtyp2 * restrict c_56,
                   const int ntypes, const __global int * dev_nbor,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom,
@ -282,7 +282,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
  if (EVFLAG && eflag) pre_sw_c56=c_56[ONETYPE];
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -461,7 +461,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                const __global numtyp2 *restrict sw_pre3,
                                const int ntypes,
                                const __global int * dev_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag,
                                const int inum,  const int nbor_pitch,
@ -480,7 +480,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
  const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -579,7 +579,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                             const __global numtyp2 *restrict sw_pre3,
                             const int ntypes, const __global int * dev_nbor,
                             const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
@ -598,7 +598,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
  const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -701,7 +701,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                             const __global numtyp2 *restrict sw_pre3,
                             const int ntypes, const __global int * dev_nbor,
                             const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
@ -720,7 +720,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
  const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@ -49,7 +49,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom,
@ -66,7 +66,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -87,6 +87,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -146,7 +147,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom,
@ -165,7 +166,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
    cutsq[tid]=cutsq_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -189,6 +190,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -251,7 +253,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom,
@ -268,7 +270,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -289,6 +291,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -352,7 +355,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -371,7 +374,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
    cutsq[tid]=cutsq_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -395,6 +398,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -461,7 +465,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom,
@ -478,7 +482,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -499,6 +503,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -569,7 +574,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
                                  const __global numtyp* sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *ans,
+                                  __global acctyp3 *ans,
                                  __global acctyp *engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -588,7 +593,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
    cutsq[tid]=cutsq_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -611,6 +616,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -686,7 +692,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
                             const __global numtyp* sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *ans,
+                             __global acctyp3 *ans,
                             __global acctyp *engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom,
@ -703,7 +709,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -724,6 +730,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -792,7 +799,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
                                  const __global numtyp* sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *ans,
+                                  __global acctyp3 *ans,
                                  __global acctyp *engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -811,7 +818,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
    cutsq[tid]=cutsq_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -835,6 +842,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -132,7 +132,7 @@ _texture_2d( pos_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -211,7 +211,7 @@ _texture_2d( pos_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -448,7 +448,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
                                  const __global int *restrict elem2param,
                                  const int nelements, const int nparams,
                                  const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -472,7 +472,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
  const numtyp ijparam_bigd = ts2_in[ONETYPE3].w;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -553,7 +553,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
                                     const __global acctyp2 *restrict zetaij,
                                     const __global acctyp *restrict zetaij_e,
                                     const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum,  const int nbor_pitch,
@ -585,7 +585,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
  const numtyp gamma = ts4_in[ONETYPE3].w;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -728,7 +728,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp *restrict zetaij_e,
                                  const __global int * dev_nbor,
                                  const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -760,7 +760,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
  const numtyp gamma = ts4_in[ONETYPE3].w;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -950,7 +950,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
                                    const __global acctyp *restrict zetaij_e,
                                    const __global int * dev_nbor,
                                    const __global int * dev_ilist,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum,  const int nbor_pitch,
@ -982,7 +982,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
  const numtyp gamma = ts4_in[ONETYPE3].w;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -131,7 +131,7 @@ _texture_2d( pos_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -209,7 +209,7 @@ _texture_2d( pos_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -417,7 +417,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
                                  const __global int *restrict elem2param,
                                  const int nelements, const int nparams,
                                  const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -434,7 +434,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
    ts2[tid]=ts2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -511,7 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
                                     const int nelements, const int nparams,
                                     const __global acctyp4 *restrict zetaij,
                                     const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum,  const int nbor_pitch,
@ -535,7 +535,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
    ts5[tid]=ts5_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -676,7 +676,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -700,7 +700,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
    ts5[tid]=ts5_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -890,7 +890,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_ilist,
-                                        __global acctyp4 *restrict ans,
+                                        __global acctyp3 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
@ -914,7 +914,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
    ts5[tid]=ts5_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@ -81,7 +81,7 @@ _texture( ts6_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -149,7 +149,7 @@ _texture( ts6_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -227,7 +227,7 @@ _texture( ts6_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -443,7 +443,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
                                  const __global int *restrict elem2param,
                                  const int nelements, const int nparams,
                                  const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -462,7 +462,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
    ts6[tid]=ts6_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -544,7 +544,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
                                     const int nelements, const int nparams,
                                     const __global acctyp4 *restrict zetaij,
                                     const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum,  const int nbor_pitch,
@ -566,7 +566,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
    ts4[tid]=ts4_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -703,7 +703,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -725,7 +725,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
    ts4[tid]=ts4_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -908,7 +908,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_ilist,
-                                        __global acctyp4 *restrict ans,
+                                        __global acctyp3 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
@ -930,7 +930,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
    ts4[tid]=ts4_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_ufm.cu
+++ b/lib/gpu/lal_ufm.cu
@ -33,7 +33,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
                   const __global numtyp *restrict sp_lj,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom) {
@ -43,7 +43,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -61,6 +61,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -109,7 +110,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int * dev_nbor,
                        const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@ -130,7 +131,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
      uf3[tid]=uf3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -151,6 +152,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@ -73,7 +73,7 @@ _texture( param5_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -132,7 +132,7 @@ _texture( param5_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -210,7 +210,7 @@ _texture( param5_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -247,6 +247,7 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
    const int out_stride=nbor_pitch*t_per_atom-t_per_atom;

    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      ucl_prefetch(dev_packed+nbor+nbor_pitch);
      int sj=dev_packed[nbor];
      int j = sj & NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -283,7 +284,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
                   const __global int *restrict elem2param,
                   const int nelements,
                   const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int ev_stride) {
@ -291,7 +292,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,

  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -313,6 +314,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
    itype=map[itype];

    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      ucl_prefetch(dev_packed+nbor+nbor_pitch);

      int j=dev_packed[nbor];
      j &= NEIGHMASK;
@ -489,7 +491,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
                                const __global int *restrict elem2param,
                                const int nelements,
                                const __global int * dev_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag,
                                const int inum,  const int nbor_pitch,
@ -504,7 +506,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,

  local_allocate_store_three();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -612,7 +614,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
@ -627,7 +629,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,

  local_allocate_store_three();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -743,7 +745,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
@ -758,7 +760,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,

  local_allocate_store_three();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@ -30,7 +30,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj_in,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch, const int t_per_atom) {
@ -46,7 +46,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -65,6 +65,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -118,7 +119,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, const int t_per_atom) {
@ -136,7 +137,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
    coeff[tid]=coeff_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -158,6 +159,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@ -40,7 +40,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom,
@ -57,7 +57,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -77,6 +77,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -131,7 +132,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
                                    const __global numtyp *restrict sp_lj_in,
                                    const __global int *dev_nbor,
                                    const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum, const int nbor_pitch,
@ -150,7 +151,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
    coeff[tid]=coeff_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -173,6 +174,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/Show More
+++ b/Show More