feat(gpu/pair): 添加 Hooke 颗粒力模型的 GPU 加速支持

实现了颗粒间 Hooke 力模型的 GPU 加速计算，主要包含以下内容： * 引入基础类 BaseSphere 处理球形颗粒的位置、半径和角速度数据 * 新增 GranHooke 类实现法向/切向弹性力与阻尼力的 GPU 计算逻辑 * 使用纹理内存优化颗粒属性访问（半径、质量、速度、角速度） * 实现设备端邻居列表构建和并行力计算内核 * 支持与 LAMMPS 的 FixNeighHistory 集成处理接触历史数据 * 添加 GPU 与 CPU 数据转换接口，支持混合粒子分组和冻结状态处理 * 整合到 LAMMPS 的 GPU 模块，可通过 pair_style gran/hooke/gpu 调用
2025-07-07 14:37:56 +08:00
parent 63fcdb6e52
commit 6dcfc82be5
8 changed files with 2298 additions and 0 deletions
--- a/lib/gpu/lal_base_sphere.cpp
+++ b/lib/gpu/lal_base_sphere.cpp
@ -0,0 +1,331 @@
+/***************************************************************************
+                                base_sphere.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Base class for pair styles needing per-particle data for position,
+  radius, angular velocity, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#include "lal_base_sphere.h"
+namespace LAMMPS_AL {
+#define BaseSphereT BaseSphere<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> global_device;
+
+template <class numtyp, class acctyp>
+BaseSphereT::BaseSphere() : _compiled(false), _max_bytes(0) {
+  device=&global_device;
+  ans=new Answer<numtyp,acctyp>();
+  nbor=new Neighbor();
+  pair_program=nullptr;
+  ucl_device=nullptr;
+  #if defined(LAL_OCL_EV_JIT)
+  pair_program_noev=nullptr;
+  #endif
+}
+
+template <class numtyp, class acctyp>
+BaseSphereT::~BaseSphere() {
+  delete ans;
+  delete nbor;
+  k_pair_fast.clear();
+  k_pair.clear();
+  if (pair_program) delete pair_program;
+  #if defined(LAL_OCL_EV_JIT)
+  k_pair_noev.clear();
+  if (pair_program_noev) delete pair_program_noev;
+  #endif
+}
+
+template <class numtyp, class acctyp>
+int BaseSphereT::bytes_per_atom_atomic(const int max_nbors) const {
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BaseSphereT::init_atomic(const int nlocal, const int nall,
+                     const int max_nbors, const int maxspecial,
+                     const double cell_size, const double gpu_split,
+                     FILE *_screen, const void *pair_program,
+                     const char *k_name) {
+  screen=_screen;
+
+  int gpu_nbor=0;
+  if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+
+  int _gpu_host=0;
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
+  if (host_nlocal>0)
+    _gpu_host=1;
+
+  _threads_per_atom=device->threads_per_atom();
+
+  bool charge = false;
+  bool rot = true;
+  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  if (ucl_device!=device->gpu) _compiled=false;
+
+  ucl_device=device->gpu;
+  atom=&device->atom;
+
+  _block_size=device->pair_block_size();
+  compile_kernels(*ucl_device,pair_program,k_name);
+
+  if (_threads_per_atom>1 && gpu_nbor==0) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+  if (success!=0)
+    return success;
+
+  hd_balancer.init(device,gpu_nbor,gpu_split);
+
+  time_pair.init(*ucl_device);
+  time_pair.zero();
+
+  pos_tex.bind_float(atom->x,4);
+
+
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void BaseSphereT::estimate_gpu_overhead(const int add_kernels) {
+  device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
+}
+
+template <class numtyp, class acctyp>
+void BaseSphereT::clear_atomic() {
+  acc_timers();
+  double avg_split=hd_balancer.all_avg_split();
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
+
+  time_pair.clear();
+  hd_balancer.clear();
+
+  nbor->clear();
+  ans->clear();
+}
+
+template <class numtyp, class acctyp>
+int * BaseSphereT::reset_nbors(const int nall, const int inum, int *ilist,
+                              int *numj, int **firstneigh, bool &success) {
+  success=true;
+
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
+  resize_atom(inum,nall,success);
+  resize_local(inum,mn,success);
+  if (!success)
+    return nullptr;
+
+  nbor->get_host(inum,ilist,numj,firstneigh,block_size());
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+
+  return ilist;
+}
+
+template <class numtyp, class acctyp>
+void BaseSphereT::build_nbor_list(const int inum, const int host_inum,
+                                 const int nall, double **host_x,
+                                 int *host_type, double *sublo,
+                                 double *subhi, tagint *tag,
+                                 int **nspecial, tagint **special,
+                                 bool &success) {
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;
+  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
+                        tag, nspecial, special, success, mn, ans->error_flag);
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+}
+
+template <class numtyp, class acctyp>
+void BaseSphereT::compute(const int f_ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         int *ilist, int *numj, int **firstneigh,
+                         const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom,
+                         int &host_start, const double cpu_time,
+                         bool &success, tagint *tag, 
+                        const int nlocal,
+                         double *boxlo, double *prd) {
+  acc_timers();
+  int eflags = eflag ? 1 : 0;
+  int vflags = vflag ? 1 : 0;
+  if (eatom) eflags=2;
+  if (vatom) vflags=2;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflags) eflags=2;
+  if (vflags) vflags=2;
+  #endif
+
+  set_kernel(eflags, vflags);
+  if (inum_full==0) {
+    host_start=0;
+    resize_atom(0,nall,success);
+    zero_timers();
+    return;
+  }
+
+  int ago=hd_balancer.ago_first(f_ago);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
+  host_start=inum;
+
+  if (ago==0) {
+    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
+    if (!success)
+      return;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  hd_balancer.start_timer();
+  atom->add_x_data(host_x,host_type);
+
+  const int red_blocks=loop(eflags, vflags);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist,red_blocks);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+}
+
+template <class numtyp, class acctyp>
+int **BaseSphereT::compute(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           double *sublo, double *subhi, tagint *tag,
+                           int **nspecial, tagint **special,
+                           const bool eflag, const bool vflag,
+                           const bool eatom, const bool vatom,
+                           int &host_start, int **ilist, int **jnum,
+                           const double cpu_time, bool &success,
+                           double *boxlo, double *prd) {
+  acc_timers();
+  int eflags = eflag ? 1 : 0;
+  int vflags = vflag ? 1 : 0;
+  if (eatom) eflags=2;
+  if (vatom) vflags=2;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflags) eflags=2;
+  if (vflags) vflags=2;
+  #endif
+
+  set_kernel(eflags, vflags);
+  if (inum_full==0) {
+    host_start=0;
+    resize_atom(0,nall,success);
+    zero_timers();
+    return nullptr;
+  }
+
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    sublo, subhi, tag, nspecial, special, success);
+    if (!success)
+      return nullptr;
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  
+  
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  const int red_blocks=loop(eflags, vflags);
+  ans->copy_answers(eflag,vflag,eatom,vatom,red_blocks);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  return nbor->host_jlist.begin()-host_start;
+}
+
+template <class numtyp, class acctyp>
+double BaseSphereT::host_memory_usage_atomic() const {
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(BaseSphere<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+void BaseSphereT::compile_kernels(UCL_Device &dev, const void *pair_str,
+                                  const char *k) {
+  if (_compiled)
+    return;
+
+  std::string s_fast=std::string(k)+"_fast";
+  if (pair_program) delete pair_program;
+  pair_program=new UCL_Program(dev);
+  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
+  pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
+  k_pair_fast.set_function(*pair_program,s_fast.c_str());
+  k_pair.set_function(*pair_program,k);
+  pos_tex.get_texture(*pair_program,"pos_tex");
+
+  #if defined(LAL_OCL_EV_JIT)
+  oclstring = device->compile_string()+" -DEVFLAG=0";
+  if (pair_program_noev) delete pair_program_noev;
+  pair_program_noev=new UCL_Program(dev);
+  pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
+  k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
+  #else
+  k_pair_sel = &k_pair_fast;
+  #endif
+
+  _compiled=true;
+
+  #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
+  if (dev.has_subgroup_support()) {
+    size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
+    #if defined(LAL_OCL_EV_JIT)
+    mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
+    #endif
+    if (_threads_per_atom > (int)mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz;
+    device->set_simd_size(mx_subgroup_sz);
+  }
+  #endif
+}
+
+template class BaseSphere<PRECISION,ACC_PRECISION>;
+}
--- a/lib/gpu/lal_base_sphere.h
+++ b/lib/gpu/lal_base_sphere.h
@ -0,0 +1,176 @@
+/***************************************************************************
+                                base_sphere.h
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Base class for pair styles needing per-particle data for position,
+  radius, angular velocity, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_BASE_SPHERE_H
+#define LAL_BASE_SPHERE_H
+
+#include "lal_device.h"
+#include "lal_balance.h"
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+#include "geryon/ocl_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
+#else
+#include "geryon/nvd_texture.h"
+#endif
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class BaseSphere {
+ public:
+  BaseSphere();
+  virtual ~BaseSphere();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * \param k_name name for the kernel for force calculation
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen,
+           const void *pair_program, const char *k_name);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead(const int add_kernels=0);
+
+  /// Check if there is enough storage for atom arrays and realloc if not
+  /** \param success set to false if insufficient memory **/
+  inline void resize_atom(const int inum, const int nall, bool &success) {
+    if (atom->resize(nall, success)) {
+      pos_tex.bind_float(atom->x,4);
+    }
+    ans->resize(inum,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  inline void resize_local(const int inum, const int max_nbors, bool &success) {
+    nbor->resize(inum,max_nbors,success);
+  }
+
+  inline void resize_local(const int inum, const int host_inum,
+                           const int max_nbors, bool &success) {
+    nbor->resize(inum,host_inum,max_nbors,success);
+  }
+
+  /// Clear all host and device data
+  void clear_atomic();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom_atomic(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage_atomic() const;
+
+  /// Accumulate timers
+  inline void acc_timers() {
+    if (device->time_device()) {
+      nbor->acc_timers(screen);
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
+    }
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    time_pair.zero();
+    atom->zero_timers();
+    ans->zero_timers();
+  }
+
+  /// Copy neighbor list from host
+  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
+                    int **firstneigh, bool &success);
+
+  /// Build neighbor list on device
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, bool &success);
+
+  /// Pair loop with host neighboring
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, tagint *tag,
+               const int nlocal, double *boxlo, double *prd);
+
+  /// Pair loop with device neighboring
+  int **compute(const int ago, const int inum_full, const int nall,
+                 double **host_x, int *host_type, double *sublo,
+                 double *subhi, tagint *tag, int **nspecial,
+                 tagint **special, const bool eflag, const bool vflag,
+                 const bool eatom, const bool vatom, int &host_start,
+                 int **ilist, int **numj, const double cpu_time, bool &success,
+                 double *boxlo, double *prd);
+
+  // -------------------------- DEVICE DATA -------------------------
+
+  Device<numtyp,acctyp> *device;
+  UCL_Device *ucl_device;
+  UCL_Timer time_pair;
+  Balance<numtyp,acctyp> hd_balancer;
+  FILE *screen;
+
+  // --------------------------- ATOM DATA --------------------------
+  Atom<numtyp,acctyp> *atom;
+
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+  Answer<numtyp,acctyp> *ans;
+
+  // --------------------------- NBOR DATA ----------------------------
+  Neighbor *nbor;
+
+  // ------------------------- DEVICE KERNELS -------------------------
+  UCL_Program *pair_program, *pair_program_noev;
+  UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
+  inline int block_size() { return _block_size; }
+  inline void set_kernel(const int eflag, const int vflag) {
+    #if defined(LAL_OCL_EV_JIT)
+    if (eflag || vflag) k_pair_sel = &k_pair_fast;
+    else k_pair_sel = &k_pair_noev;
+    #endif
+  }
+
+  // --------------------------- TEXTURES -----------------------------
+  UCL_Texture pos_tex;
+
+ protected:
+  bool _compiled;
+  int _block_size, _threads_per_atom;
+  double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
+
+  void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
+
+  virtual int loop(const int eflag, const int vflag) = 0;
+};
+
+}
+#endif
--- a/lib/gpu/lal_gran_hooke.cpp
+++ b/lib/gpu/lal_gran_hooke.cpp
@ -0,0 +1,391 @@
+/***************************************************************************
+ *                                 gran_hooke.cpp
+ *                              -------------------
+ *                            Trung Dac Nguyen (ORNL)
+ * 
+ *  Class for acceleration of the gran/hooke pair style.
+ * 
+ * __________________________________________________________________________
+ *    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ * __________________________________________________________________________
+ * 
+ *    begin                : 
+ *    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "gran_hooke_cl.h"
+#elif defined(USE_CUDART)
+const char *gran_hooke=0;
+#else
+#include "gran_hooke_cubin.h"
+#endif
+
+#include "lal_gran_hooke.h"
+#include <cassert>
+using namespace LAMMPS_AL;
+#define GranHookeT GranHooke<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+GranHookeT::GranHooke() : BaseSphere<numtyp,acctyp>(),
+                                        _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+GranHookeT::~GranHooke() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int GranHookeT::init(const int ntypes,
+                           double host_k_n, double host_k_t,
+                           double host_gamman, double host_gammat,
+                           double host_xmu, double host_dt, const int host_dampflag,
+                           double *host_special_lj, int *host_mask, 
+                           const int nlocal, const int nall, const int max_nbors,
+                           const int maxspecial, const double cell_size,
+                           const double gpu_split, FILE *screen) {
+  int onetype=0;
+  int success;
+  success = this->init_atomic(nlocal, nall, max_nbors, maxspecial,
+                             cell_size, gpu_split, screen, 
+                             gran_hooke, "k_gran_hooke");
+  if (success != 0)
+    return success;
+  
+   _shared_view = this->ucl_device->shared_memory() && sizeof(numtyp)==sizeof(double);
+
+
+   // allocate rad
+
+  int ef_nall=nall;
+  if (ef_nall==0)
+    ef_nall=2000;
+
+  _max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
+
+  if (!_shared_view)
+  {
+     c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
+     c_rmass.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
+     c_omega.alloc(_max_rad_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
+     c_vel.alloc(_max_rad_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
+  }
+    
+  rad_tex.get_texture(*(this->pair_program),"rad_tex");
+  rad_tex.bind_float(c_rad,1);
+
+  rmass_tex.get_texture(*(this->pair_program),"rmass_tex");
+  rmass_tex.bind_float(c_rmass,1);
+
+  omega_tex.get_texture(*(this->pair_program),"omega_tex");
+  omega_tex.bind_float(c_omega,4);
+
+  vel_tex.get_texture(*(this->pair_program),"vel_tex");
+  vel_tex.bind_float(c_vel,4);
+
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {//TODO
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  //shared_types=false;//TEST
+  _k_n=host_k_n;
+  _k_t=host_k_t;
+  _gamman=host_gamman;
+  _gammat=host_gammat;
+  _xmu=host_xmu;
+  _dampflag=host_dampflag;
+  _dt=host_dt;
+
+  UCL_H_Vec<int> h_mask;
+  _mask.alloc(_max_rad_size,*(this->ucl_device),UCL_READ_ONLY);
+  h_mask.view(host_mask,_max_rad_size,*(this->ucl_device));
+  ucl_copy(_mask,h_mask,false);
+  
+  UCL_H_Vec<double> dview;
+  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  dview.view(host_special_lj,4,*(this->ucl_device));
+  ucl_copy(sp_lj,dview,false);
+
+  _allocated=true;
+  this->_max_bytes=sp_lj.row_bytes() + _mask.row_bytes();
+  
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void GranHookeT::clear() {
+  if (!_allocated)
+    return;
+
+  _allocated = false;
+  sp_lj.clear();
+  _mask.clear();
+  c_rad.clear();
+  c_omega.clear();
+  c_rmass.clear();
+  c_vel.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double GranHookeT::host_memory_usage() const {
+  return this->host_memory_usage_atomic() + 
+         sizeof(GranHooke<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+int GranHookeT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors) + 
+         sizeof(numtyp4)*2 + sizeof(numtyp2) + sizeof(int) + 
+         sizeof(numtyp)*3 + sizeof(numtyp)*3*max_nbors;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int GranHookeT::loop(const int _eflag, const int _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX = this->block_size();
+  //int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) / BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+  
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_fast.set_size(GX, BX);
+    this->k_pair_fast.run(&this->atom->x, &c_rad, &c_vel,
+                          &c_omega, &sp_lj, &c_rmass,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &_mask, &this->ans->force,
+                          &this->ans->engv, &_eflag, &_vflag,
+                          &ainum, &nbor_pitch, &this->_threads_per_atom,
+                          &_k_n, &_k_t, &_gamman, &_gammat, &_xmu, 
+                          &_dampflag, &freeze_group_bit,
+                          &_limit_damping, &_dt);
+    
+  } else {    
+    this->k_pair.set_size(GX, BX);
+    this->k_pair.run(&this->atom->x, &c_rad, &c_vel,
+                     &c_omega, &sp_lj, &c_rmass,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                    &_mask, &this->ans->force,
+                     &this->ans->engv, &_eflag, &_vflag,
+                     &ainum, &nbor_pitch, &this->_threads_per_atom,
+                     &_k_n, &_k_t, &_gamman, &_gammat, &_xmu, 
+                     &_dampflag, &freeze_group_bit,
+                     &_limit_damping, &_dt);
+  }
+  this->time_pair.stop();
+  return GX;
+}
+
+template <class numtyp, class acctyp>
+void GranHookeT::compute(const int f_ago, const int inum_full,
+                          const int nall, double **host_x, int *host_type,
+                          int *ilist, int *numj, int **firstneigh,
+                          const bool eflag_in, const bool vflag_in,
+                          const bool eatom, const bool vatom,
+                          int &host_start, const double cpu_time,
+                          bool &success, tagint *tag, 
+                          double **host_v, double *host_rad, 
+                          double **host_omega,  double *host_rmass, int limit_damping,
+                          const int nlocal, double *boxlo, double *prd) {
+  this->acc_timers();
+  int eflag, vflag;
+  if (eatom) eflag=2;
+  else if (eflag_in) eflag=1;
+  else eflag=0;
+  if (vatom) vflag=2;
+  else if (vflag_in) vflag=1;
+  else vflag=0;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflag) eflag=2;
+  if (vflag) vflag=2;
+  #endif
+
+  this->set_kernel(eflag,vflag);
+  _limit_damping = limit_damping;
+   // ------------------- Resize data array --------------------------
+  if (nall>_max_rad_size) {
+    _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    if (!_shared_view) {
+      c_rad.resize(_max_rad_size);
+      rad_tex.bind_float(c_rad,1);
+
+      c_omega.resize(_max_rad_size);
+      omega_tex.bind_float(c_omega,4);
+
+      c_vel.resize(_max_rad_size);
+      vel_tex.bind_float(c_vel,4);
+
+      c_rmass.resize(_max_rad_size);
+      rmass_tex.bind_float(c_rmass,1);
+    }
+  }
+
+  // ----------------------------------------------------------------
+
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    this->resize_atom(0,nall,success);
+    this->zero_timers();
+    return;
+  }
+
+  // load balance, returning the atom count on the device (inum)
+  int ago=this->hd_balancer.ago_first(f_ago);
+  //this->hd_balancer.balance(cpu_time);
+  //int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
+  int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
+  this->ans->inum(inum);
+  host_start=inum;
+
+
+  // -----------------------------------------------------------------
+
+  if (ago==0) {
+    this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
+    if (!success)
+      return;
+  }
+
+  this->atom->cast_x_data(host_x,host_type);
+  this->cast_vel_data(host_v);
+  this->cast_rad_data(host_rad);
+  this->cast_omega_data(host_omega);
+  this->cast_rmass_data(host_rmass);
+  this->hd_balancer.start_timer();
+  this->atom->add_x_data(host_x,host_type);
+  this->add_vel_data();
+  this->add_rad_data();
+  this->add_rmass_data();
+  this->add_omega_data();
+
+  const int red_blocks=this->loop(eflag,vflag);
+  this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
+  this->device->add_ans_object(this->ans);
+  this->hd_balancer.stop_timer();
+
+}
+
+template <class numtyp, class acctyp>
+int** GranHookeT::compute(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           double *sublo, double *subhi, tagint *tag,
+                           int **nspecial, tagint **special,
+                           const bool eflag_in, const bool vflag_in,
+                           const bool eatom, const bool vatom,
+                           int &host_start, int **ilist, int **jnum,
+                           const double cpu_time, bool &success,
+                           double **host_v, double *host_rad, 
+                           double **host_omega, double *host_rmass, int limit_damping,
+                           double *boxlo, double *prd) {
+  this->acc_timers();
+  int eflag, vflag;
+  if (eatom) eflag=2;
+  else if (eflag_in) eflag=1;
+  else eflag=0;
+  if (vatom) vflag=2;
+  else if (vflag_in) vflag=1;
+  else vflag=0;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflag) eflag=2;
+  if (vflag) vflag=2;
+  #endif
+
+  this->set_kernel(eflag,vflag);
+
+  _limit_damping= limit_damping;
+
+   // ------------------- Resize data array --------------------------
+  if (nall>_max_rad_size) {
+    _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    if (!_shared_view) {
+      c_rad.resize(_max_rad_size);
+      rad_tex.bind_float(c_rad,1);
+
+      c_omega.resize(_max_rad_size);
+      omega_tex.bind_float(c_omega,4);
+
+      c_vel.resize(_max_rad_size);
+      vel_tex.bind_float(c_vel,4);
+
+      c_rmass.resize(_max_rad_size);
+      rmass_tex.bind_float(c_rmass,1);
+    }
+  }
+
+   // -----------------------------------------------------------------
+
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    this->resize_atom(0,nall,success);
+    this->zero_timers();
+    return nullptr;
+  }
+
+  // ----------------------------------------------------------------
+
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    this->resize_atom(0,nall,success);
+    this->zero_timers();
+    return nullptr;
+  }
+
+  // load balance, returning the atom count on the device (inum)
+  this->hd_balancer.balance(cpu_time);
+  int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
+  this->ans->inum(inum);
+  host_start=inum;
+
+  // Build neighbor list on GPU if necessary
+  if (ago==0) {
+    this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                          sublo, subhi, tag, nspecial, special, success);
+    if (!success)
+      return nullptr;
+    this->cast_vel_data(host_v);
+    this->cast_rad_data(host_rad);
+    this->cast_omega_data(host_omega);
+    this->cast_rmass_data(host_rmass);
+    this->hd_balancer.start_timer();
+  } else {
+    this->atom->cast_x_data(host_x,host_type);
+    this->cast_vel_data(host_v);
+    this->cast_rad_data(host_rad);
+    this->cast_omega_data(host_omega);
+    this->cast_rmass_data(host_rmass);
+    this->hd_balancer.start_timer();
+    this->atom->add_x_data(host_x,host_type);
+  }
+  this->add_vel_data();
+  this->add_rad_data();
+  this->add_rmass_data();
+  this->add_omega_data();
+  *ilist=this->nbor->host_ilist.begin();
+  *jnum=this->nbor->host_acc.begin();
+
+  const int red_blocks=this->loop(eflag,vflag);
+  this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
+  this->device->add_ans_object(this->ans);
+  this->hd_balancer.stop_timer();
+
+  return this->nbor->host_jlist.begin()-host_start;
+}
+// Instantiate the class
+template class GranHooke<PRECISION,ACC_PRECISION>;
--- a/lib/gpu/lal_gran_hooke.cu
+++ b/lib/gpu/lal_gran_hooke.cu
@ -0,0 +1,491 @@
+// **************************************************************************
+//                                 gran_hooke.cu
+//                             -------------------
+//                           Trung Dac Nguyen (ORNL)
+//
+//  Device code for acceleration of the gran/hooke pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : nguyentd@ornl.gov
+// ***************************************************************************
+
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+#include "lal_aux_fun1.h"
+// 确保类型定义可见
+#ifndef numtyp
+#define numtyp float
+#endif
+#ifndef numtyp4
+#define numtyp4 float4
+#endif
+#ifndef acctyp
+#define acctyp float
+#endif
+#ifndef acctyp3
+#define acctyp3 float3
+#endif
+#ifndef _DOUBLE_DOUBLE
+_texture( pos_tex,float4);
+_texture( rad_tex,float);
+_texture( vel_tex,float4);
+_texture( omega_tex,float4);
+_texture( ramss_tex,float);
+#else
+_texture_2d( pos_tex,int4);
+_texture( rad_tex,int2);
+_texture_2d( vel_tex,int4);
+_texture_2d( omega_tex,int4);
+_texture( ramss_tex,int2);
+#endif
+
+#if (__CUDACC_VER_MAJOR__ >= 11)
+#define rad_tex rad_
+#define vel_tex v_
+#define omega_tex omega_
+#endif
+
+#else
+#define pos_tex x_
+#define rad_tex rad_
+#define vel_tex v_
+#define omega_tex omega_
+#define ramss_tex mass_rigid_
+#define vel_rad_tex v_rad_
+#define omega_ramss_tex omega_ramss_
+#endif
+
+#define MIN(A,B) ((A) < (B) ? (A) : (B))
+
+#define store_answers_t(f, tor, energy, virial, ii, inum, tid,              \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      f.x += shfl_down(f.x, s, t_per_atom);                                 \
+      f.y += shfl_down(f.y, s, t_per_atom);                                 \
+      f.z += shfl_down(f.z, s, t_per_atom);                                 \
+      tor.x += shfl_down(tor.x, s, t_per_atom);                             \
+      tor.y += shfl_down(tor.y, s, t_per_atom);                             \
+      tor.z += shfl_down(tor.z, s, t_per_atom);                             \
+      if (EVFLAG) energy += shfl_down(energy, s, t_per_atom);               \
+    }                                                                       \
+    if (EVFLAG && vflag) {                                                  \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+        for (int r=0; r<6; r++)                                             \
+          virial[r] += shfl_down(virial[r], s, t_per_atom);                 \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (EVFLAG && eflag) {                                                  \
+      *ap1=energy*(acctyp)0.5;                                              \
+      ap1+=inum;                                                            \
+    }                                                                       \
+    if (EVFLAG && vflag) {                                                  \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i]*(acctyp)0.5;                                         \
+        ap1+=inum;                                                          \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+inum]=tor;                                                       \
+  }
+
+__kernel void k_gran_hooke(const __global numtyp4 *restrict x_,                // 位置
+                                   const __global numtyp *restrict rad_,        // 半径
+                                   const __global numtyp4 *restrict v_,         // 速度
+                                   const __global numtyp4 *restrict omega_,     // 角速度
+                                   const __global numtyp *restrict sp_lj_in,    // 特殊LJ系数
+                                   const __global numtyp *restrict mass_rigid_, // 刚体质量
+                                   const __global int *dev_nbor,                // 邻居列表
+                                   const __global int *dev_packed,              // 打包的邻居
+                                   __global numtyp *restrict mask_,             // 
+                                   __global acctyp3 *restrict ans,              // 力
+                                   __global acctyp *restrict engv,              // 能量
+                                   const int eflag, const int vflag,            // 能量/维里标志
+                                   const int inum, const int nbor_pitch,        // 原子数/邻居间距
+                                   const int t_per_atom,                        // 每原子线程数
+                                   const numtyp kn, const numtyp kt,            // 法向/切向刚度
+                                   const numtyp gamman, const numtyp gammat,    // 法向/切向阻尼
+                                   const numtyp xmu, const int dampflag,        // 摩擦系数/阻尼标志
+                                   const int freeze_group_bit,                  // 冻结组位
+                                   const int limit_damping,                     // damping限制标志位
+                                   const numtyp dt) {                           // 时间步长
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+  
+  __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  
+  acctyp3 f, torque_i;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  torque_i.x=(acctyp)0; torque_i.y=(acctyp)0; torque_i.z=(acctyp)0;
+  acctyp energy, virial[6];
+
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }
+    
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+    
+    numtyp4 ix; fetch4(ix,i,pos_tex);
+    numtyp radi; fetch(radi,i,rad_tex);
+    numtyp rmassi; fetch(rmassi,i,mass_rigid_);
+    numtyp4 iv; fetch4(iv,i,vel_tex);
+    numtyp4 iomega; fetch4(iomega,i,omega_tex);
+    int maski = mask_[i];
+    
+    numtyp factor_lj;
+    
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
+
+      int j=dev_packed[nbor];
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+      
+      numtyp4 jx; fetch4(jx,j,pos_tex);
+      numtyp radj; fetch(radj,j,rad_tex);
+      numtyp rmassj; fetch(rmassj,j,mass_rigid_);
+      int maskj = mask_[j];
+
+      
+      if (factor_lj == 0.0) continue;
+      // 计算相对位置
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+      numtyp radsum = radi + radj;
+     
+      if (rsq < radsum*radsum)
+      {
+        numtyp r = ucl_sqrt(rsq);
+        numtyp rinv = ucl_recip(r);
+        numtyp rsqinv = ucl_recip(rsq);
+        
+        // 相对平移速度    
+        numtyp4 jv; fetch4(jv,j,vel_tex);
+        numtyp vr1 = iv.x-jv.x;
+        numtyp vr2 = iv.y-jv.y;
+        numtyp vr3 = iv.z-jv.z;
+
+        // 法向分量
+        numtyp vnnr = vr1*delx + vr2*dely + vr3*delz;
+        numtyp vn1 = delx*vnnr*rsqinv;
+        numtyp vn2 = dely*vnnr*rsqinv;
+        numtyp vn3 = delz*vnnr*rsqinv;
+
+        // 切向分量
+        numtyp vt1 = vr1 - vn1;
+        numtyp vt2 = vr2 - vn2;
+        numtyp vt3 = vr3 - vn3;
+
+        // 相对旋转速度
+
+        numtyp4 jomega; fetch4(jomega,j,omega_tex);
+        numtyp wr1 = (radi*iomega.x + radj*jomega.x)*rinv;
+        numtyp wr2 = (radi*iomega.y + radj*jomega.y)*rinv;
+        numtyp wr3 = (radi*iomega.z + radj*jomega.z)*rinv;
+        
+        // 有效质量
+        numtyp mi = rmassi;
+        numtyp mj = rmassj;
+        numtyp meff = (mi*mj)/(mi+mj);
+        if (maski & freeze_group_bit) meff = mj;
+        if (maskj & freeze_group_bit) meff = mi;
+
+        // 法向力 = 弹性 + 阻尼
+        numtyp damp = meff*gamman*vnnr*rsqinv;
+        numtyp ccel = kn*(radsum-r)*rinv - damp;
+        if (limit_damping&&(ccel < (numtyp)0.0)) ccel = (numtyp)0.0;
+
+        // 切向速度（考虑旋转）
+        numtyp vtr1 = vt1 - (delz*wr2 - dely*wr3);
+        numtyp vtr2 = vt2 - (delx*wr3 - delz*wr1);
+        numtyp vtr3 = vt3 - (dely*wr1 - delx*wr2);
+        numtyp vrel = ucl_sqrt(vtr1*vtr1+vtr2*vtr2+vtr3*vtr3);
+        
+        numtyp fn = xmu * ucl_abs(ccel*r);
+        numtyp fs = meff*gammat*vrel;
+        
+        numtyp ft = 0.0;
+        if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
+
+        numtyp fs1 = -ft*vtr1;
+        numtyp fs2 = -ft*vtr2;
+        numtyp fs3 = -ft*vtr3;
+
+
+        numtyp fx = delx*ccel + fs1;
+        numtyp fy = dely*ccel + fs2;
+        numtyp fz = delz*ccel + fs3;
+
+
+        fx = factor_lj*fx;
+        fy = factor_lj*fy;
+        fz = factor_lj*fz;
+
+        f.x+=fx;
+        f.y+=fy;
+        f.z+=fz;
+        
+        // 力矩计算 (与CPU一致: torque = rinv * (r × fs))
+        numtyp tor1 = rinv * (dely*fs3 - delz*fs2);
+        numtyp tor2 = rinv * (delz*fs1 - delx*fs3);
+        numtyp tor3 = rinv * (delx*fs2 - dely*fs1);
+
+        tor1 *= factor_lj;
+        tor2 *= factor_lj;
+        tor3 *= factor_lj;
+        
+        // 更新i粒子力矩 (负号表示作用反力)
+        torque_i.x -= radi * tor1;
+        torque_i.y -= radi * tor2;
+        torque_i.z -= radi * tor3;
+        
+        if (EVFLAG && vflag) {
+          virial[0] += delx*fx;
+          virial[1] += dely*fy;
+          virial[2] += delz*fz;
+          virial[3] += delx*fy;
+          virial[4] += delx*fz;
+          virial[5] += dely*fz;
+        }
+         
+      } 
+    }
+  }
+  store_answers_t(f,torque_i,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                  vflag,ans,engv);
+}
+
+__kernel void k_gran_hooke_fast(const __global numtyp4 *restrict x_,         // 位置
+                                        const __global numtyp *restrict rad_,        // 半径
+                                        const __global numtyp4 *restrict v_,         // 速度
+                                        const __global numtyp4 *restrict omega_,     // 角速度
+                                        const __global numtyp *restrict sp_lj_in,    // 特殊LJ系数
+                                        const __global numtyp *restrict mass_rigid_, // 刚体质量
+                                        const __global int *dev_nbor,                // 邻居列表
+                                        const __global int *dev_packed,              // 打包的邻居
+                                        __global numtyp *restrict mask_,             // 
+                                        __global acctyp3 *restrict ans,              // 力
+                                        __global acctyp *restrict engv,              // 能量
+                                        const int eflag, const int vflag,            // 能量/维里标志
+                                        const int inum, const int nbor_pitch,        // 原子数/邻居间距
+                                        const int t_per_atom,                        // 每原子线程数
+                                        const numtyp kn, const numtyp kt,            // 法向/切向刚度
+                                        const numtyp gamman, const numtyp gammat,    // 法向/切向阻尼
+                                        const numtyp xmu, const int dampflag,        // 摩擦系数/阻尼标志
+                                        const int freeze_group_bit,                  // 冻结组位
+                                        const int limit_damping,                       // 剪切位移更新标志位
+                                        const numtyp dt) {                           // 时间步长
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+  
+  
+
+  int n_stride;
+  local_allocate_store_pair();
+
+  acctyp3 f, torque_i;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  torque_i.x=(acctyp)0; torque_i.y=(acctyp)0; torque_i.z=(acctyp)0;
+  acctyp energy, virial[6];
+
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }
+
+  int nbor, nbor_end;
+  int i, numj;
+  nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+            n_stride,nbor_end,nbor);
+    
+  __local numtyp sp_lj[4];
+  if (tid<4) {
+    sp_lj[tid]=sp_lj_in[tid];
+  }
+
+    //__local numtyp4 ixs[64];
+    __local numtyp4 ivs[64];
+    __local numtyp4 iomegas[64];
+    __local numtyp radis[64];
+    __local numtyp rmassis[64];
+
+    int idx=0;
+    idx=int(THREAD_ID_X/t_per_atom);
+    if(THREAD_ID_X%t_per_atom==0&&ii<inum)
+    {
+      //fetch4(ixs[idx],i,pos_tex);
+      fetch4(iomegas[idx],i,omega_tex);
+      fetch4(ivs[idx],i,vel_tex);
+      fetch(rmassis[idx],i,mass_rigid_);
+      fetch(radis[idx],i,rad_tex);
+    }
+
+  __syncthreads();
+
+  if (ii<inum) {
+    numtyp radi; radi=radis[idx];
+    numtyp rmassi; rmassi=rmassis[idx];
+    numtyp4 iomega; iomega=iomegas[idx];
+    numtyp4 iv; iv=ivs[idx];
+    /*
+    
+
+    __syncthreads();
+    //numtyp4 ix; ix=ixs[idx];
+    numtyp radi; radi=radis[idx];
+    //numtyp rmassi; rmassi=rmassis[idx];
+    //numtyp4 iv; iv=ivs[idx];
+    //numtyp4 iomega; iomega=iomegas[idx];
+    */
+    numtyp4 ix; fetch4(ix,i,pos_tex);
+    //numtyp radi; fetch(radi,i,rad_tex);
+    //numtyp rmassi; fetch(rmassi,i,mass_rigid_);
+    //numtyp4 iv; fetch4(iv,i,vel_tex);
+    //numtyp4 iomega; fetch4(iomega,i,omega_tex);
+   
+    int maski = mask_[i];
+
+    numtyp factor_lj;
+    
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
+
+      int j=dev_packed[nbor];
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+      
+      numtyp4 jx; fetch4(jx,j,pos_tex);
+      numtyp radj; fetch(radj,j,rad_tex);
+      numtyp rmassj; fetch(rmassj,j,mass_rigid_);
+      int maskj = mask_[j];
+
+      
+      if (factor_lj == 0.0) continue;
+      // 计算相对位置
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+      numtyp radsum = radi + radj;
+     
+      if (rsq < radsum*radsum)
+      {
+        numtyp r = ucl_sqrt(rsq);
+        numtyp rinv = ucl_recip(r);
+        numtyp rsqinv = ucl_recip(rsq);
+        
+        // 相对平移速度    
+        numtyp4 jv; fetch4(jv,j,vel_tex);
+        numtyp vr1 = iv.x-jv.x;
+        numtyp vr2 = iv.y-jv.y;
+        numtyp vr3 = iv.z-jv.z;
+
+        // 法向分量
+        numtyp vnnr = vr1*delx + vr2*dely + vr3*delz;
+        numtyp vn1 = delx*vnnr*rsqinv;
+        numtyp vn2 = dely*vnnr*rsqinv;
+        numtyp vn3 = delz*vnnr*rsqinv;
+
+        // 切向分量
+        numtyp vt1 = vr1 - vn1;
+        numtyp vt2 = vr2 - vn2;
+        numtyp vt3 = vr3 - vn3;
+
+        // 相对旋转速度
+
+        numtyp4 jomega; fetch4(jomega,j,omega_tex);
+        numtyp wr1 = (radi*iomega.x + radj*jomega.x)*rinv;
+        numtyp wr2 = (radi*iomega.y + radj*jomega.y)*rinv;
+        numtyp wr3 = (radi*iomega.z + radj*jomega.z)*rinv;
+        
+        // 有效质量
+        numtyp mi = rmassi;
+        numtyp mj = rmassj;
+        numtyp meff = (mi*mj)/(mi+mj);
+        if (maski & freeze_group_bit) meff = mj;
+        if (maskj & freeze_group_bit) meff = mi;
+
+        // 法向力 = 弹性 + 阻尼
+        numtyp damp = meff*gamman*vnnr*rsqinv;
+        numtyp ccel = kn*(radsum-r)*rinv - damp;
+        if (limit_damping&&(ccel < (numtyp)0.0)) ccel = (numtyp)0.0;
+
+        // 切向速度（考虑旋转）
+        numtyp vtr1 = vt1 - (delz*wr2 - dely*wr3);
+        numtyp vtr2 = vt2 - (delx*wr3 - delz*wr1);
+        numtyp vtr3 = vt3 - (dely*wr1 - delx*wr2);
+        numtyp vrel = ucl_sqrt(vtr1*vtr1+vtr2*vtr2+vtr3*vtr3);
+        
+        numtyp fn = xmu * ucl_abs(ccel*r);
+        numtyp fs = meff*gammat*vrel;
+        
+        numtyp ft = 0.0;
+        if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
+
+        numtyp fs1 = -ft*vtr1;
+        numtyp fs2 = -ft*vtr2;
+        numtyp fs3 = -ft*vtr3;
+
+
+        numtyp fx = delx*ccel + fs1;
+        numtyp fy = dely*ccel + fs2;
+        numtyp fz = delz*ccel + fs3;
+
+        fx = factor_lj*fx;
+        fy = factor_lj*fy;
+        fz = factor_lj*fz;
+
+        f.x+=fx;
+        f.y+=fy;
+        f.z+=fz;
+        
+        // 力矩计算 (与CPU一致: torque = rinv * (r × fs))
+        numtyp tor1 = rinv * (dely*fs3 - delz*fs2);
+        numtyp tor2 = rinv * (delz*fs1 - delx*fs3);
+        numtyp tor3 = rinv * (delx*fs2 - dely*fs1);
+
+        tor1 *= factor_lj;
+        tor2 *= factor_lj;
+        tor3 *= factor_lj;
+        
+        // 更新i粒子力矩 (负号表示作用反力)
+        torque_i.x -= radi * tor1;
+        torque_i.y -= radi * tor2;
+        torque_i.z -= radi * tor3;
+        
+        if (EVFLAG && vflag) {
+          virial[0] += delx*fx;
+          virial[1] += dely*fy;
+          virial[2] += delz*fz;
+          virial[3] += delx*fy;
+          virial[4] += delx*fz;
+          virial[5] += dely*fz;
+        }
+         
+      } 
+    }
+  }
+  store_answers_t(f,torque_i,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                  vflag,ans,engv);
+}
--- a/lib/gpu/lal_gran_hooke.h
+++ b/lib/gpu/lal_gran_hooke.h
@ -0,0 +1,252 @@
+/***************************************************************************
+ *                               gran_hooke.h
+ *                              -------------------
+ *                            Trung Dac Nguyen (ORNL)
+ * 
+ *  Class for acceleration of the gran/hooke pair style.
+ * 
+ * __________________________________________________________________________
+ *    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ * __________________________________________________________________________
+ * 
+ *    begin                : 
+ *    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_GRAN_HOOKE_HISTORY_H
+#define LAL_GRAN_HOOKE_HISTORY_H
+
+#include "lal_base_sphere.h"
+#include "lal_atom.h"
+
+
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class GranHooke : public BaseSphere<numtyp, acctyp> {
+ public:
+  GranHooke();
+  ~GranHooke();
+
+  typedef struct { double x,y,z; } vec3d;
+  typedef struct { numtyp x,y,z,w; } vec4d_t;
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes,
+           double host_k_n, double host_k_t, 
+           double host_gamman, double host_gammat,
+           double host_xmu, double host_dt, const int host_dampflag,
+           double *host_special_lj, int *host_mask, 
+           const int nlocal, const int nall, const int max_nbors, const int maxspecial,
+           const double cell_size, const double gpu_split, FILE *screen);
+
+  inline void cast_rad_data(double* rad) {
+    int nall = this->atom->nall();
+    if (_shared_view) {
+      c_rad.host.view((numtyp*)rad,nall,*(this->ucl_device));
+      c_rad.device.view(c_rad.host);
+    } else {
+      if (sizeof(numtyp)==sizeof(double))
+        memcpy(c_rad.host.begin(),rad,nall*sizeof(numtyp));
+      else
+        for (int i=0; i<nall; i++) c_rad[i]=rad[i];
+    }
+  }
+
+  // Copy rad to device asynchronously
+  inline void add_rad_data() {
+    c_rad.update_device(this->atom->nall(),true);
+  }
+
+  inline void cast_rmass_data(double* rmass) {
+    int nall = this->atom->nall();
+    if (_shared_view) {
+      c_rmass.host.view((numtyp*)rmass,nall,*(this->ucl_device));
+      c_rmass.device.view(c_rmass.host);
+    } else {
+      if (sizeof(numtyp)==sizeof(double))
+        memcpy(c_rmass.host.begin(),rmass,nall*sizeof(numtyp));
+      else
+        for (int i=0; i<nall; i++) c_rmass[i]=rmass[i];
+    }
+  }
+
+  // Copy rmass to device asynchronously
+  inline void add_rmass_data() {
+    c_rmass.update_device(this->atom->nall(),true);
+  }
+
+  /*inline void cast_omega_data(double** omega) {
+    int nall = this->atom->nall();
+    if (_shared_view) {
+      c_omega.host.view((numtyp*)omega,nall,*(this->ucl_device));
+      c_omega.device.view(c_omega.host);
+    } else {
+      if (sizeof(numtyp)==sizeof(double))
+        memcpy(c_omega.host.begin(),omega,nall*sizeof(numtyp));
+      else
+        for (int i=0; i<nall; i++)
+        {
+            c_omega[i]=omega[i];
+        } 
+    }
+  }
+
+  // Copy rmass to device asynchronously
+  inline void add_omega_data() {
+    c_omega.update_device(this->atom->nall(),true);
+  }*/
+
+  inline void cast_omega_data(double** omega) {
+    int nall = this->atom->nall();
+    #ifdef GPU_CAST
+    memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
+    memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
+    #else
+    vec3d *host_p=reinterpret_cast<vec3d*>(&(omega[0][0]));
+    vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(c_omega[0]));
+    #if (LAL_USE_OMP == 1)
+    #pragma omp parallel for schedule(static)
+    #endif
+    for (int i=0; i<nall; i++) {
+        vp[i].x=host_p[i].x;
+        vp[i].y=host_p[i].y;
+        vp[i].z=host_p[i].z;
+        vp[i].w=0.0;
+    }
+    #endif
+  }
+
+
+  // Copy rmass to device asynchronously
+  inline void add_omega_data() {
+    c_omega.update_device(this->atom->nall()*4,true);
+  }
+  
+  inline void cast_vel_data(double** vel) {
+    int nall = this->atom->nall();
+    #ifdef GPU_CAST
+    memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
+    memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
+    #else
+    vec3d *host_p=reinterpret_cast<vec3d*>(&(vel[0][0]));
+    vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(c_vel[0]));
+    #if (LAL_USE_OMP == 1)
+    #pragma omp parallel for schedule(static)
+    #endif
+    for (int i=0; i<nall; i++) {
+        vp[i].x=host_p[i].x;
+        vp[i].y=host_p[i].y;
+        vp[i].z=host_p[i].z;
+        vp[i].w=0.0;
+    }
+    #endif
+  }
+
+
+  // Copy rmass to device asynchronously
+  inline void add_vel_data() {
+    c_vel.update_device(this->atom->nall()*4,true);
+  }
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  void compute(const int ago, const int inum_full,
+               const int nall, double **host_x, int *host_type,
+               int *ilist, int *numj, int **firstneigh,
+               const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom,
+               int &host_start, const double cpu_time,
+               bool &success, tagint *tag, 
+               double **host_v, double *host_rad, 
+               double **host_omega,  double *host_rmass, int limit_damping,
+               const int nlocal, double *boxlo, double *prd);
+
+  int **compute(const int ago, const int inum_full,
+                const int nall, double **host_x, int *host_type,
+                double *sublo, double *subhi, tagint *tag,
+                int **nspecial, tagint **special,
+                const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom,
+                int &host_start, int **ilist, int **jnum,
+                const double cpu_time, bool &success,
+                double **host_v, double *host_rad, 
+                double **host_omega, double *host_rmass, int limit_damping,
+                double *boxlo, double *prd);
+
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// coeff.x = k_n, coeff.y = k_t, coeff.z = cutsq, coeff.w = gamman
+  UCL_D_Vec<numtyp4> coeff;
+
+  numtyp _k_n;
+  numtyp _k_t;                   
+  numtyp _gamman; 
+  numtyp _gammat;
+  numtyp _xmu; 
+  numtyp _dt;
+  numtyp _dampflag;
+  int _limit_damping;
+  /// coeff2.x = gammat, coeff2.y = xmu
+  UCL_D_Vec<numtyp2> coeff2;
+  /// Special LJ values
+  UCL_D_Vec<numtyp> sp_lj;
+
+  UCL_D_Vec<numtyp> _mask;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  /// Per-atom arrays
+  UCL_Vector<numtyp,numtyp> c_rad;
+
+  UCL_Vector<numtyp,numtyp> c_omega;
+
+  UCL_Vector<numtyp,numtyp> c_vel;
+
+  UCL_Vector<numtyp,numtyp> c_rmass;
+
+// --------------------------- TEXTURES -----------------------------
+  UCL_Texture rad_tex;
+
+  UCL_Texture omega_tex;
+
+  UCL_Texture vel_tex;
+
+  UCL_Texture rmass_tex;
+
+  numtyp freeze_group_bit;
+
+  int _max_rad_size;
+
+ private:
+  bool _shared_view;
+  bool _allocated;
+  int loop(const int _eflag, const int _vflag);
+};
+
+}
+
+#endif
--- a/lib/gpu/lal_gran_hooke_ext.cpp
+++ b/lib/gpu/lal_gran_hooke_ext.cpp
@ -0,0 +1,134 @@
+/***************************************************************************
+ *                             gran_hooke_ext.cpp
+ *                              -------------------
+ *                             Trung Dac Nguyen (ORNL)
+ * 
+ *   Functions for LAMMPS access to gran/hooke acceleration routines.
+ * 
+ * __________________________________________________________________________
+ *    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ * __________________________________________________________________________
+ * 
+ *    begin                : 
+ *    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_gran_hooke.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static GranHooke<PRECISION,ACC_PRECISION> GHM;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int gran_hooke_gpu_init(const int ntypes, double **cutsq,
+                              const double host_k_n, const double host_k_t,
+                              const double host_gamman, const double host_gammat,
+                              const double host_xmu, const double host_dt, const int host_dampflag,
+                              double *special_lj, int *host_mask,
+                              const int inum, const int nall, const int max_nbors, 
+                              const int maxspecial, const double cell_size, int &gpu_mode, 
+                              FILE *screen) {
+  GHM.clear();
+  gpu_mode=GHM.device->gpu_mode();
+  double gpu_split=GHM.device->particle_split();
+  int first_gpu=GHM.device->first_device();
+  int last_gpu=GHM.device->last_device();
+  int world_me=GHM.device->world_me();
+  int gpu_rank=GHM.device->gpu_rank();
+  int procs_per_gpu=GHM.device->procs_per_gpu();
+
+  GHM.device->init_message(screen,"gran/hooke",first_gpu,last_gpu);
+
+  bool message=false;
+  if (GHM.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=GHM.init(ntypes, host_k_n, host_k_t, host_gamman,
+                     host_gammat, host_xmu, host_dt, host_dampflag, special_lj,
+                     host_mask, inum, nall, max_nbors, maxspecial, cell_size, 
+                     gpu_split, screen);
+
+  GHM.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=GHM.init(ntypes, host_k_n, host_k_t, host_gamman,
+                     host_gammat, host_xmu, host_dt, host_dampflag, special_lj,
+                     host_mask, inum, nall, max_nbors, maxspecial, cell_size, 
+                     gpu_split, screen);
+
+    GHM.device->serialize_init();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    GHM.estimate_gpu_overhead();
+  return init_ok;
+}
+
+void gran_hooke_gpu_clear() {
+  GHM.clear();
+}
+
+int ** gran_hooke_gpu_compute_n(const int ago, const int inum_full,
+                                      const int nall, double **host_x, int *host_type,
+                                      double *sublo, double *subhi, tagint *tag,
+                                      int **nspecial, tagint **special,
+                                      const bool eflag, const bool vflag,
+                                      const bool eatom, const bool vatom,
+                                      int &host_start, int **ilist, int **jnum,
+                                      const double cpu_time, bool &success,
+                                      double **host_v, double *host_rad, 
+                                      double **host_omega, double *host_rmass, int limit_damping,
+                                      double *boxlo, double *prd) {
+  return GHM.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
+                     tag, nspecial, special, eflag, vflag, eatom, vatom,
+                     host_start, ilist, jnum, cpu_time, success, host_v, host_rad,
+                     host_omega, host_rmass, limit_damping, boxlo, prd);
+}
+
+void gran_hooke_gpu_compute(const int ago, const int inum_full,
+                                    const int nall, double **host_x, int *host_type,
+                                    int *ilist, int *numj, int **firstneigh,
+                                    const bool eflag, const bool vflag,
+                                    const bool eatom, const bool vatom,
+                                    int &host_start, const double cpu_time,
+                                    bool &success, tagint *tag, 
+                                    double **host_v, double *host_rad, 
+                                    double **host_omega,  double *host_rmass, int limit_damping,
+                                    const int nlocal, double *boxlo, double *prd) {
+  GHM.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
+              firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time,
+              success, tag, host_v, host_rad, host_omega, host_rmass, limit_damping, nlocal, boxlo, prd);
+}
+
+double gran_hooke_gpu_bytes() {
+  return GHM.host_memory_usage();
+}
--- a/src/GPU/pair_gran_hooke_gpu.cpp
+++ b/src/GPU/pair_gran_hooke_gpu.cpp
@ -0,0 +1,477 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Trung Dac Nguyen (ORNL)
+------------------------------------------------------------------------- */
+
+#include "pair_gran_hooke_gpu.h"
+
+#include "atom.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "gpu_extra.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+#include "suffix.h"
+#include "update.h"
+#include "memory.h"
+#include "comm.h"
+
+#include <cmath>
+
+using namespace LAMMPS_NS;
+
+// External functions from cuda library for atom decomposition
+
+int gran_hooke_gpu_init(const int ntypes, double **cutsq,
+                              const double host_k_n, const double host_k_t,
+                              const double host_gamman, const double host_gammat,
+                              const double host_xmu, const double host_dt, const int host_dampflag,
+                              double *special_lj, int *host_mask, 
+                              const int inum, const int nall, const int max_nbors, 
+                              const int maxspecial, const double cell_size, int &gpu_mode, 
+                              FILE *screen);
+void gran_hooke_gpu_clear();
+int ** gran_hooke_gpu_compute_n(const int ago, const int inum_full,
+                                      const int nall, double **host_x, int *host_type,
+                                      double *sublo, double *subhi, tagint *tag,
+                                      int **nspecial, tagint **special,
+                                      const bool eflag, const bool vflag,
+                                      const bool eatom, const bool vatom,
+                                      int &host_start, int **ilist, int **jnum,
+                                      const double cpu_time, bool &success,
+                                      double **host_v, double *host_rad, 
+                                      double **host_omega, double *host_rmass, int limit_damping,
+                                      double *boxlo, double *prd);
+void gran_hooke_gpu_compute(const int f_ago, const int inum_full,
+                                    const int nall, double **host_x, int *host_type,
+                                    int *ilist, int *numj, int **firstneigh,
+                                    const bool eflag, const bool vflag,
+                                    const bool eatom, const bool vatom,
+                                    int &host_start, const double cpu_time,
+                                    bool &success, tagint *tag, 
+                                    double **host_v, double *host_rad, 
+                                    double **host_omega,  double *host_rmass, int limit_damping,
+                                    const int nlocal, double *boxlo, double *prd);
+double gran_hooke_gpu_bytes();
+
+/* ---------------------------------------------------------------------- */
+
+PairGranHookeGPU::PairGranHookeGPU(LAMMPS *lmp) :
+    PairGranHooke(lmp), gpu_mode(GPU_FORCE)
+{
+  respa_enable = 0;
+  reinitflag = 0;
+  cpu_time = 0.0;
+  suffix_flag |= Suffix::GPU;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairGranHookeGPU::~PairGranHookeGPU()
+{
+  gran_hooke_gpu_clear();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGranHookeGPU::compute(int eflag, int vflag)
+{
+  ev_init(eflag, vflag);
+
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+
+  int shearupdate = 1;
+  if (update->setupflag) shearupdate = 0;
+
+  if (gpu_mode != GPU_FORCE) {
+    double sublo[3], subhi[3];
+    if (domain->triclinic == 0) {
+      sublo[0] = domain->sublo[0];
+      sublo[1] = domain->sublo[1];
+      sublo[2] = domain->sublo[2];
+      subhi[0] = domain->subhi[0];
+      subhi[1] = domain->subhi[1];
+      subhi[2] = domain->subhi[2];
+    } else {
+      domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi);
+    }
+
+    
+    inum = atom->nlocal;
+    firstneigh = gran_hooke_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type,
+                                                 sublo, subhi, atom->tag, atom->nspecial,
+                                                 atom->special, eflag, vflag, eflag_atom,
+                                                 vflag_atom, host_start, &ilist, &numneigh,
+                                                 cpu_time, success, atom->v,atom->radius, atom->omega, atom->rmass,
+                                                 limit_damping, domain->boxlo, domain->prd);
+  } else {
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    gran_hooke_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh,
+                                  firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start,
+                                  cpu_time, success, atom->tag, atom->v, atom->radius, atom->omega, atom->rmass,
+                                  limit_damping, atom->nlocal, domain->boxlo, domain->prd);
+  }
+  if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
+  if (host_start < inum) {
+    cpu_time = platform::walltime();
+    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
+    cpu_time = platform::walltime() - cpu_time;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairGranHookeGPU::init_style()
+{
+  if (!atom->radius_flag || !atom->rmass_flag || !atom->omega_flag)
+    error->all(FLERR, "Pair style gran/hooke/history/gpu requires atom attributes radius, rmass, omega");
+  if (comm->ghost_velocity == 0)
+    error->all(FLERR, "Pair gran/h* requires ghost atoms store velocity");
+
+  if (gpu_mode == GPU_FORCE) 
+    {
+      if (history)
+        neighbor->add_request(this, NeighConst::REQ_SIZE | NeighConst::REQ_HISTORY);
+      else
+        neighbor->add_request(this, NeighConst::REQ_SIZE);
+    }
+
+  // Repeat cutsq calculation because done after call to init_style
+  double maxcut = -1.0;
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i, j);
+        cut *= cut;
+        if (cut > maxcut) maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
+    }
+  }
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  int maxspecial = 0;
+  if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
+  int mnf = 5e-2 * neighbor->oneatom;
+  //dt = update->dt;
+
+  if (history && (fix_history == nullptr)) {
+    auto cmd = fmt::format("NEIGH_HISTORY_HH{} all NEIGH_HISTORY {}", instance_me, size_history);
+    fix_history = dynamic_cast<FixNeighHistory *>(
+        modify->replace_fix("NEIGH_HISTORY_HH_DUMMY" + std::to_string(instance_me), cmd, 1));
+    fix_history->pair = this;
+  }
+
+  // check for FixFreeze and set freeze_group_bit
+
+  auto fixlist = modify->get_fix_by_style("^freeze");
+  if (fixlist.size() == 0)
+    freeze_group_bit = 0;
+  else if (fixlist.size() > 1)
+    error->all(FLERR, "Only one fix freeze command at a time allowed");
+  else
+    freeze_group_bit = fixlist.front()->groupbit;
+
+  // check for FixRigid so can extract rigid body masses
+
+  fix_rigid = nullptr;
+  for (const auto &ifix : modify->get_fix_list()) {
+    if (ifix->rigid_flag) {
+      if (fix_rigid)
+        error->all(FLERR, "Only one fix rigid command at a time allowed");
+      else
+        fix_rigid = ifix;
+    }
+  }
+
+  // check for FixPour and FixDeposit so can extract particle radii
+
+  auto pours = modify->get_fix_by_style("^pour");
+  auto deps = modify->get_fix_by_style("^deposit");
+
+  // set maxrad_dynamic and maxrad_frozen for each type
+  // include future FixPour and FixDeposit particles as dynamic
+
+  int itype;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    onerad_dynamic[i] = onerad_frozen[i] = 0.0;
+    for (auto &ipour : pours) {
+      itype = i;
+      double maxrad = *((double *) ipour->extract("radius", itype));
+      if (maxrad > 0.0) onerad_dynamic[i] = maxrad;
+    }
+    for (auto &idep : deps) {
+      itype = i;
+      double maxrad = *((double *) idep->extract("radius", itype));
+      if (maxrad > 0.0) onerad_dynamic[i] = maxrad;
+    }
+  }
+
+  double *radius = atom->radius;
+  int *mask = atom->mask;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+    if (mask[i] & freeze_group_bit)
+      onerad_frozen[type[i]] = MAX(onerad_frozen[type[i]], radius[i]);
+    else
+      onerad_dynamic[type[i]] = MAX(onerad_dynamic[type[i]], radius[i]);
+  }
+
+  MPI_Allreduce(&onerad_dynamic[1], &maxrad_dynamic[1], atom->ntypes, MPI_DOUBLE, MPI_MAX, world);
+  MPI_Allreduce(&onerad_frozen[1], &maxrad_frozen[1], atom->ntypes, MPI_DOUBLE, MPI_MAX, world);
+
+  // set fix which stores history info
+
+  if (history) {
+    fix_history = dynamic_cast<FixNeighHistory *>(
+        modify->get_fix_by_id("NEIGH_HISTORY_HH" + std::to_string(instance_me)));
+    if (!fix_history) error->all(FLERR, "Could not find pair fix neigh history ID");
+  }
+
+  int success = gran_hooke_gpu_init(atom->ntypes + 1, cutsq, kn, kt, gamman, gammat, xmu, update->dt,
+                                          dampflag, force->special_lj, atom->mask, 
+                                          atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial,
+                                          cell_size, gpu_mode, screen);
+  GPU_EXTRA::check_flag(success, error, world);
+
+ 
+}
+
+void PairGranHookeGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist,
+                              int *numneigh, int **firstneigh)
+{
+  int i, j, ii, jj, jnum;
+  double xtmp, ytmp, ztmp, delx, dely, delz, fx, fy, fz;
+  double radi, radj, radsum, rsq, r, rinv, rsqinv, factor_lj;
+  double vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
+  double wr1, wr2, wr3;
+  double vtr1, vtr2, vtr3, vrel;
+  double mi, mj, meff, damp, ccel, tor1, tor2, tor3;
+  double fn,fs,ft,fs1,fs2,fs3;
+  double shrmag, rsht;
+  int *jlist;
+  int *touch, **firsttouch;
+  double *shear, *allshear, **firstshear;
+
+  ev_init(eflag, vflag);
+
+  int shearupdate = 1;
+  if (update->setupflag) shearupdate = 0;
+
+  // update rigid body info for owned & ghost atoms if using FixRigid masses
+  // body[i] = which body atom I is in, -1 if none
+  // mass_body = mass of each rigid body
+
+  if (fix_rigid && neighbor->ago == 0) {
+    int tmp;
+    int *body = (int *) fix_rigid->extract("body", tmp);
+    auto mass_body = (double *) fix_rigid->extract("masstotal", tmp);
+    if (atom->nmax > nmax) {
+      memory->destroy(mass_rigid);
+      nmax = atom->nmax;
+      memory->create(mass_rigid, nmax, "pair:mass_rigid");
+    }
+    int nlocal = atom->nlocal;
+    for (i = 0; i < nlocal; i++)
+      if (body[i] >= 0)
+        mass_rigid[i] = mass_body[body[i]];
+      else
+        mass_rigid[i] = 0.0;
+    comm->forward_comm(this);
+  }
+
+  double **x = atom->x;
+  double **v = atom->v;
+  double **f = atom->f;
+  double **omega = atom->omega;
+  double **torque = atom->torque;
+  double *radius = atom->radius;
+  double *rmass = atom->rmass;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+  int newton_pair = force->newton_pair;
+  double *special_lj = force->special_lj;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  firsttouch = fix_history->firstflag;
+  firstshear = fix_history->firstvalue;
+
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      if (factor_lj == 0) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      radj = radius[j];
+      radsum = radi + radj;
+
+      if (rsq < radsum*radsum) {
+        r = sqrt(rsq);
+        rinv = 1.0/r;
+        rsqinv = 1.0/rsq;
+
+        // relative translational velocity
+
+        vr1 = v[i][0] - v[j][0];
+        vr2 = v[i][1] - v[j][1];
+        vr3 = v[i][2] - v[j][2];
+
+        // normal component
+
+        vnnr = vr1*delx + vr2*dely + vr3*delz;
+        vn1 = delx*vnnr * rsqinv;
+        vn2 = dely*vnnr * rsqinv;
+        vn3 = delz*vnnr * rsqinv;
+
+        // tangential component
+
+        vt1 = vr1 - vn1;
+        vt2 = vr2 - vn2;
+        vt3 = vr3 - vn3;
+
+        // relative rotational velocity
+
+        wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
+        wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
+        wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
+
+        // meff = effective mass of pair of particles
+        // if I or J part of rigid body, use body mass
+        // if I or J is frozen, meff is other particle
+
+        mi = rmass[i];
+        mj = rmass[j];
+        if (fix_rigid) {
+          if (mass_rigid[i] > 0.0) mi = mass_rigid[i];
+          if (mass_rigid[j] > 0.0) mj = mass_rigid[j];
+        }
+
+        meff = mi*mj / (mi+mj);
+        if (mask[i] & freeze_group_bit) meff = mj;
+        if (mask[j] & freeze_group_bit) meff = mi;
+
+        // normal forces = Hookian contact + normal velocity damping
+
+        damp = meff*gamman*vnnr*rsqinv;
+        ccel = kn*(radsum-r)*rinv - damp;
+        if (limit_damping && (ccel < 0.0)) ccel = 0.0;
+
+        // relative velocities
+
+        vtr1 = vt1 - (delz*wr2-dely*wr3);
+        vtr2 = vt2 - (delx*wr3-delz*wr1);
+        vtr3 = vt3 - (dely*wr1-delx*wr2);
+        vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
+        vrel = sqrt(vrel);
+
+        // force normalization
+
+        fn = xmu * fabs(ccel*r);
+        fs = meff*gammat*vrel;
+        if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
+        else ft = 0.0;
+
+        // tangential force due to tangential velocity damping
+
+        fs1 = -ft*vtr1;
+        fs2 = -ft*vtr2;
+        fs3 = -ft*vtr3;
+
+        // forces & torques
+
+        fx = delx*ccel + fs1;
+        fy = dely*ccel + fs2;
+        fz = delz*ccel + fs3;
+        fx *= factor_lj;
+        fy *= factor_lj;
+        fz *= factor_lj;
+        f[i][0] += fx;
+        f[i][1] += fy;
+        f[i][2] += fz;
+
+        tor1 = rinv * (dely*fs3 - delz*fs2);
+        tor2 = rinv * (delz*fs1 - delx*fs3);
+        tor3 = rinv * (delx*fs2 - dely*fs1);
+        tor1 *= factor_lj;
+        tor2 *= factor_lj;
+        tor3 *= factor_lj;
+        torque[i][0] -= radi*tor1;
+        torque[i][1] -= radi*tor2;
+        torque[i][2] -= radi*tor3;
+
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= fx;
+          f[j][1] -= fy;
+          f[j][2] -= fz;
+          torque[j][0] -= radj*tor1;
+          torque[j][1] -= radj*tor2;
+          torque[j][2] -= radj*tor3;
+        }
+
+        if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+                                 0.0,0.0,fx,fy,fz,delx,dely,delz);
+      }
+    }
+  }
+
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairGranHookeGPU::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+  return bytes + gran_hooke_gpu_bytes();
+}
--- a/src/GPU/pair_gran_hooke_gpu.h
+++ b/src/GPU/pair_gran_hooke_gpu.h
@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(gran/hooke/gpu,PairGranHookeGPU);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_GRAN_HOOKE_GPU_H
+#define LMP_PAIR_GRAN_HOOKE_GPU_H
+
+#include "pair_gran_hooke.h"
+#include "fix_neigh_history.h"
+
+namespace LAMMPS_NS {
+
+class PairGranHookeGPU : public PairGranHooke {
+ public:
+  PairGranHookeGPU(LAMMPS *lmp);
+  ~PairGranHookeGPU() override;
+  void cpu_compute(int, int, int, int, int *, int *, int **);
+  void compute(int, int) override;
+  void init_style() override;
+  double memory_usage() override;
+
+  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
+
+ private:
+  int gpu_mode;
+  double cpu_time;
+};
+
+}    // namespace LAMMPS_NS
+#endif
+#endif