diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp
index cda4d383b5..618ffb0106 100644
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const {
     bytes+=sizeof(numtyp);
   if (_vel)
     bytes+=4*sizeof(numtyp);
+  if (_extra_fields>0)
+    bytes+=_extra_fields*sizeof(numtyp);
   return bytes;
 }
 
@@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) {
                                    UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=v.device.row_bytes();
   }
+  if (_extra_fields>0 && _host_view==false) {
+    success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=extra.device.row_bytes();
+  }
 
   if (_gpu_nbor>0) {
     if (_bonds) {
@@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) {
 
 template <class numtyp, class acctyp>
 bool AtomT::add_fields(const bool charge, const bool rot,
-                       const int gpu_nbor, const bool bonds, const bool vel) {
+                       const int gpu_nbor, const bool bonds, const bool vel,
+                       const int extra_fields) {
   bool success=true;
   // Ignore host/device transfers?
   int gpu_bytes=0;
@@ -191,6 +199,16 @@ bool AtomT::add_fields(const bool charge, const bool rot,
     }
   }
 
+  if (extra_fields > 0 && _extra_fields==0) {
+    _extra_fields=extra_fields;
+    _other=true;
+    if (_host_view==false) {
+      success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
+                                     UCL_READ_ONLY)==UCL_SUCCESS);
+      gpu_bytes+=extra.device.row_bytes();
+    }
+  }
+
   if (bonds && _bonds==false) {
     _bonds=true;
     if (_bonds && _gpu_nbor>0) {
@@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
 
 template <class numtyp, class acctyp>
 bool AtomT::init(const int nall, const bool charge, const bool rot,
-                 UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
+                 UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel,
+                 const int extra_fields) {
   clear();
 
   bool success=true;
@@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
   _q_avail=false;
   _quat_avail=false;
   _v_avail=false;
+  _extra_avail=false;
   _resized=false;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
   _rot=rot;
   _vel=vel;
-  _other=_charge || _rot || _vel;
+  _extra_fields=extra_fields;
+  _other=_charge || _rot || _vel || (extra_fields>0);
   dev=&devi;
   _time_transfer=0;
 
@@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
   time_q.init(*dev);
   time_quat.init(*dev);
   time_vel.init(*dev);
+  time_extra.init(*dev);
+
   time_pos.zero();
   time_q.zero();
   time_quat.zero();
   time_vel.zero();
+  time_extra.zero();
+
   _time_cast=0.0;
 
   #ifdef GPU_CAST
@@ -308,6 +333,8 @@ void AtomT::clear_resize() {
     quat.clear();
   if (_vel)
     v.clear();
+  if (_extra_fields>0)
+    extra.clear();
 
   dev_cell_id.clear();
   dev_particle_id.clear();
@@ -350,6 +377,7 @@ void AtomT::clear() {
   time_q.clear();
   time_quat.clear();
   time_vel.clear();
+  time_extra.clear();
   clear_resize();
 
   #ifdef GPU_CAST
@@ -370,6 +398,8 @@ double AtomT::host_memory_usage() const {
     atom_bytes+=4;
   if (_vel)
     atom_bytes+=4;
+  if (_extra_fields>0)
+    atom_bytes+=_extra_fields;
   return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
 }
 
diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h
index 3cf97d94a0..ff335fffa9 100644
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@@ -76,7 +76,7 @@ class Atom {
     *        gpu_nbor 2 if binning on host and neighboring on device **/
   bool init(const int nall, const bool charge, const bool rot,
             UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
-            const bool vel=false);
+            const bool vel=false, const int extra_fields=0);
 
   /// Check if we have enough device storage and realloc if not
   /** Returns true if resized with any call during this timestep **/
@@ -96,7 +96,7 @@ class Atom {
     *        gpu_nbor 1 if neighboring will be performed on device
     *        gpu_nbor 2 if binning on host and neighboring on device **/
   bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
-                  const bool bonds, const bool vel=false);
+                  const bool bonds, const bool vel=false, const int extra_fields=0);
 
   /// Returns true if GPU is using charges
   bool charge() { return _charge; }
@@ -107,6 +107,9 @@ class Atom {
   /// Returns true if GPU is using velocities
   bool velocity() { return _vel; }
 
+  /// Returns true if GPU is using extra fields
+  bool using_extra() { return _extra_fields; }
+
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();
 
@@ -450,6 +453,38 @@ class Atom {
     add_v_data(host_ptr,host_tag);
   }
 
+ // Cast extras to write buffer
+  template<class cpytyp>
+  inline void cast_extra_data(cpytyp *host_ptr) {
+    if (_extra_avail==false) {
+      double t=MPI_Wtime();
+      if (_host_view) {
+        extra.host.view((numtyp*)host_ptr,_nall*_extra_fields,*dev);
+        extra.device.view(extra.host);
+      } else if (sizeof(numtyp)==sizeof(double))
+        memcpy(extra.host.begin(),host_ptr,_nall*_extra_fields*sizeof(numtyp));
+      else
+        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
+        #pragma omp parallel for simd schedule(static)
+        #elif (LAL_USE_OMP_SIMD == 1)
+        #pragma omp simd
+        #endif
+        for (int i=0; i<_nall*_extra_fields; i++) extra[i]=host_ptr[i];
+      _time_cast+=MPI_Wtime()-t;
+    }
+  }
+
+  // Copy extras to device
+  /** Copies nall()*_extra elements **/
+  inline void add_extra_data() {
+    time_extra.start();
+    if (_extra_avail==false) {
+      extra.update_device(_nall*_extra_fields,true);
+      _extra_avail=true;
+    }
+    time_extra.stop();
+  }
+
   /// Add in casting time from additional data (seconds)
   inline void add_cast_time(double t) { _time_cast+=t; }
 
@@ -473,6 +508,8 @@ class Atom {
   UCL_Vector<numtyp,numtyp> quat;
   /// Velocities
   UCL_Vector<numtyp,numtyp> v;
+  /// Extras
+  UCL_Vector<numtyp,numtyp> extra;
 
   #ifdef GPU_CAST
   UCL_Vector<double,double> x_cast;
@@ -493,7 +530,7 @@ class Atom {
   UCL_H_Vec<int> host_particle_id;
 
   /// Device timers
-  UCL_Timer time_pos, time_q, time_quat, time_vel;
+  UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra;
 
   /// Geryon device
   UCL_Device *dev;
@@ -508,11 +545,12 @@ class Atom {
   bool _compiled;
 
   // True if data has been copied to device already
-  bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
+  bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized;
 
   bool alloc(const int nall);
 
   bool _allocated, _rot, _charge, _bonds, _vel, _other;
+  int _extra_fields;
   int _max_atoms, _nall, _gpu_nbor;
   bool _host_view;
   double _time_cast, _time_transfer;
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index e2b5b9cdb5..8908f3aff7 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -424,7 +424,7 @@ template <class numtyp, class acctyp>
 int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
                   const bool rot, const int nlocal,
                   const int nall, const int maxspecial,
-                  const bool vel) {
+                  const bool vel, const int extra_fields) {
   if (!_device_init)
     return -1;
   if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@@ -453,7 +453,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
 
   if (_init_count==0) {
     // Initialize atom and nbor data
-    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
+    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields))
       return -3;
 
     _data_in_estimate++;
@@ -463,6 +463,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
       _data_in_estimate++;
     if (vel)
       _data_in_estimate++;
+    if (extra_fields>0)
+      _data_in_estimate++;
+
   } else {
     if (atom.charge()==false && charge)
       _data_in_estimate++;
@@ -470,7 +473,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
       _data_in_estimate++;
     if (atom.velocity()==false && vel)
       _data_in_estimate++;
-    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
+      if (atom.using_extra()==false && extra_fields>0)
+      _data_in_estimate++;
+    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields))
       return -3;
   }
 
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 1db6ae3127..01d3d64627 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -61,6 +61,7 @@ class Device {
     * \param nall Total number of local+ghost particles
     * \param maxspecial Maximum mumber of special bonded atoms per atom
     * \param vel True if velocities need to be stored
+    * \param extra_fields Nonzero if extra fields need to be stored
     *
     * Returns:
     * -  0 if successful
@@ -70,7 +71,7 @@ class Device {
     * - -5 Double precision is not supported on card **/
   int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
            const int nlocal, const int nall, const int maxspecial,
-           const bool vel=false);
+           const bool vel=false, const int extra_fields=0);
 
   /// Initialize the device for Atom storage only
   /** \param nlocal Total number of local particles to allocate memory for
diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h
index 12cf6345c2..2ef8af0911 100644
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -330,6 +330,10 @@
 #define NEIGHMASK 0x3FFFFFFF
 ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
 
+#define SBBITS15 29
+#define NEIGHMASK15 0x1FFFFFFF
+ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; };
+
 // default to 32-bit smallint and other ints, 64-bit bigint:
 // same as defined in src/lmptype.h
 #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \