feat(gpu/pair): 添加 Hooke 颗粒力模型的 GPU 加速支持

实现了颗粒间 Hooke 力模型的 GPU 加速计算,主要包含以下内容:

* 引入基础类 BaseSphere 处理球形颗粒的位置、半径和角速度数据
* 新增 GranHooke 类实现法向/切向弹性力与阻尼力的 GPU 计算逻辑
* 使用纹理内存优化颗粒属性访问(半径、质量、速度、角速度)
* 实现设备端邻居列表构建和并行力计算内核
* 支持与 LAMMPS 的 FixNeighHistory 集成处理接触历史数据
* 添加 GPU 与 CPU 数据转换接口,支持混合粒子分组和冻结状态处理
* 整合到 LAMMPS 的 GPU 模块,可通过 pair_style gran/hooke/gpu 调用
This commit is contained in:
2025-07-07 14:37:56 +08:00
parent 63fcdb6e52
commit 6dcfc82be5
8 changed files with 2298 additions and 0 deletions

331
lib/gpu/lal_base_sphere.cpp Normal file
View File

@ -0,0 +1,331 @@
/***************************************************************************
base_sphere.cpp
-------------------
Trung Dac Nguyen (ORNL)
Base class for pair styles needing per-particle data for position,
radius, angular velocity, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include "lal_base_sphere.h"
namespace LAMMPS_AL {
#define BaseSphereT BaseSphere<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseSphereT::BaseSphere() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
}
template <class numtyp, class acctyp>
BaseSphereT::~BaseSphere() {
delete ans;
delete nbor;
k_pair_fast.clear();
k_pair.clear();
if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
}
template <class numtyp, class acctyp>
int BaseSphereT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseSphereT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
bool charge = false;
bool rot = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program,k_name);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
hd_balancer.init(device,gpu_nbor,gpu_split);
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->x,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BaseSphereT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseSphereT::clear_atomic() {
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
}
template <class numtyp, class acctyp>
int * BaseSphereT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return nullptr;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
template <class numtyp, class acctyp>
void BaseSphereT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
template <class numtyp, class acctyp>
void BaseSphereT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, tagint *tag,
const int nlocal,
double *boxlo, double *prd) {
acc_timers();
int eflags = eflag ? 1 : 0;
int vflags = vflag ? 1 : 0;
if (eatom) eflags=2;
if (vatom) vflags=2;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflags) eflags=2;
if (vflags) vflags=2;
#endif
set_kernel(eflags, vflags);
if (inum_full==0) {
host_start=0;
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
const int red_blocks=loop(eflags, vflags);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
template <class numtyp, class acctyp>
int **BaseSphereT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double *boxlo, double *prd) {
acc_timers();
int eflags = eflag ? 1 : 0;
int vflags = vflag ? 1 : 0;
if (eatom) eflags=2;
if (vatom) vflags=2;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflags) eflags=2;
if (vflags) vflags=2;
#endif
set_kernel(eflags, vflags);
if (inum_full==0) {
host_start=0;
resize_atom(0,nall,success);
zero_timers();
return nullptr;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return nullptr;
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
const int red_blocks=loop(eflags, vflags);
ans->copy_answers(eflag,vflag,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseSphereT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseSphere<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseSphereT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *k) {
if (_compiled)
return;
std::string s_fast=std::string(k)+"_fast";
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,k);
pos_tex.get_texture(*pair_program,"pos_tex");
#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif
_compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.has_subgroup_support()) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > (int)mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
}
template class BaseSphere<PRECISION,ACC_PRECISION>;
}

176
lib/gpu/lal_base_sphere.h Normal file
View File

@ -0,0 +1,176 @@
/***************************************************************************
base_sphere.h
-------------------
Trung Dac Nguyen (ORNL)
Base class for pair styles needing per-particle data for position,
radius, angular velocity, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BASE_SPHERE_H
#define LAL_BASE_SPHERE_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseSphere {
public:
BaseSphere();
virtual ~BaseSphere();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(const int add_kernels=0);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->x,4);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, tagint *tag,
const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int **compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
Device<numtyp,acctyp> *device;
UCL_Device *ucl_device;
UCL_Timer time_pair;
Balance<numtyp,acctyp> hd_balancer;
FILE *screen;
// --------------------------- ATOM DATA --------------------------
Atom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
Neighbor *nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
protected:
bool _compiled;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual int loop(const int eflag, const int vflag) = 0;
};
}
#endif

391
lib/gpu/lal_gran_hooke.cpp Normal file
View File

@ -0,0 +1,391 @@
/***************************************************************************
* gran_hooke.cpp
* -------------------
* Trung Dac Nguyen (ORNL)
*
* Class for acceleration of the gran/hooke pair style.
*
* __________________________________________________________________________
* This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
* __________________________________________________________________________
*
* begin :
* email : nguyentd@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)
#include "gran_hooke_cl.h"
#elif defined(USE_CUDART)
const char *gran_hooke=0;
#else
#include "gran_hooke_cubin.h"
#endif
#include "lal_gran_hooke.h"
#include <cassert>
using namespace LAMMPS_AL;
#define GranHookeT GranHooke<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
GranHookeT::GranHooke() : BaseSphere<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
GranHookeT::~GranHooke() {
clear();
}
template <class numtyp, class acctyp>
int GranHookeT::init(const int ntypes,
double host_k_n, double host_k_t,
double host_gamman, double host_gammat,
double host_xmu, double host_dt, const int host_dampflag,
double *host_special_lj, int *host_mask,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen) {
int onetype=0;
int success;
success = this->init_atomic(nlocal, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen,
gran_hooke, "k_gran_hooke");
if (success != 0)
return success;
_shared_view = this->ucl_device->shared_memory() && sizeof(numtyp)==sizeof(double);
// allocate rad
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
_max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
if (!_shared_view)
{
c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
c_rmass.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
c_omega.alloc(_max_rad_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
c_vel.alloc(_max_rad_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
}
rad_tex.get_texture(*(this->pair_program),"rad_tex");
rad_tex.bind_float(c_rad,1);
rmass_tex.get_texture(*(this->pair_program),"rmass_tex");
rmass_tex.bind_float(c_rmass,1);
omega_tex.get_texture(*(this->pair_program),"omega_tex");
omega_tex.bind_float(c_omega,4);
vel_tex.get_texture(*(this->pair_program),"vel_tex");
vel_tex.bind_float(c_vel,4);
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {//TODO
lj_types=max_shared_types;
shared_types=true;
}
//shared_types=false;//TEST
_k_n=host_k_n;
_k_t=host_k_t;
_gamman=host_gamman;
_gammat=host_gammat;
_xmu=host_xmu;
_dampflag=host_dampflag;
_dt=host_dt;
UCL_H_Vec<int> h_mask;
_mask.alloc(_max_rad_size,*(this->ucl_device),UCL_READ_ONLY);
h_mask.view(host_mask,_max_rad_size,*(this->ucl_device));
ucl_copy(_mask,h_mask,false);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=sp_lj.row_bytes() + _mask.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void GranHookeT::clear() {
if (!_allocated)
return;
_allocated = false;
sp_lj.clear();
_mask.clear();
c_rad.clear();
c_omega.clear();
c_rmass.clear();
c_vel.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double GranHookeT::host_memory_usage() const {
return this->host_memory_usage_atomic() +
sizeof(GranHooke<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
int GranHookeT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors) +
sizeof(numtyp4)*2 + sizeof(numtyp2) + sizeof(int) +
sizeof(numtyp)*3 + sizeof(numtyp)*3*max_nbors;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int GranHookeT::loop(const int _eflag, const int _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX = this->block_size();
//int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) / BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX, BX);
this->k_pair_fast.run(&this->atom->x, &c_rad, &c_vel,
&c_omega, &sp_lj, &c_rmass,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_mask, &this->ans->force,
&this->ans->engv, &_eflag, &_vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_k_n, &_k_t, &_gamman, &_gammat, &_xmu,
&_dampflag, &freeze_group_bit,
&_limit_damping, &_dt);
} else {
this->k_pair.set_size(GX, BX);
this->k_pair.run(&this->atom->x, &c_rad, &c_vel,
&c_omega, &sp_lj, &c_rmass,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_mask, &this->ans->force,
&this->ans->engv, &_eflag, &_vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_k_n, &_k_t, &_gamman, &_gammat, &_xmu,
&_dampflag, &freeze_group_bit,
&_limit_damping, &_dt);
}
this->time_pair.stop();
return GX;
}
template <class numtyp, class acctyp>
void GranHookeT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, tagint *tag,
double **host_v, double *host_rad,
double **host_omega, double *host_rmass, int limit_damping,
const int nlocal, double *boxlo, double *prd) {
this->acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
this->set_kernel(eflag,vflag);
_limit_damping = limit_damping;
// ------------------- Resize data array --------------------------
if (nall>_max_rad_size) {
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
if (!_shared_view) {
c_rad.resize(_max_rad_size);
rad_tex.bind_float(c_rad,1);
c_omega.resize(_max_rad_size);
omega_tex.bind_float(c_omega,4);
c_vel.resize(_max_rad_size);
vel_tex.bind_float(c_vel,4);
c_rmass.resize(_max_rad_size);
rmass_tex.bind_float(c_rmass,1);
}
}
// ----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
// load balance, returning the atom count on the device (inum)
int ago=this->hd_balancer.ago_first(f_ago);
//this->hd_balancer.balance(cpu_time);
//int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
host_start=inum;
// -----------------------------------------------------------------
if (ago==0) {
this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
this->atom->cast_x_data(host_x,host_type);
this->cast_vel_data(host_v);
this->cast_rad_data(host_rad);
this->cast_omega_data(host_omega);
this->cast_rmass_data(host_rmass);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
this->add_vel_data();
this->add_rad_data();
this->add_rmass_data();
this->add_omega_data();
const int red_blocks=this->loop(eflag,vflag);
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
}
template <class numtyp, class acctyp>
int** GranHookeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_v, double *host_rad,
double **host_omega, double *host_rmass, int limit_damping,
double *boxlo, double *prd) {
this->acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
this->set_kernel(eflag,vflag);
_limit_damping= limit_damping;
// ------------------- Resize data array --------------------------
if (nall>_max_rad_size) {
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
if (!_shared_view) {
c_rad.resize(_max_rad_size);
rad_tex.bind_float(c_rad,1);
c_omega.resize(_max_rad_size);
omega_tex.bind_float(c_omega,4);
c_vel.resize(_max_rad_size);
vel_tex.bind_float(c_vel,4);
c_rmass.resize(_max_rad_size);
rmass_tex.bind_float(c_rmass,1);
}
}
// -----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return nullptr;
}
// ----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return nullptr;
}
// load balance, returning the atom count on the device (inum)
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return nullptr;
this->cast_vel_data(host_v);
this->cast_rad_data(host_rad);
this->cast_omega_data(host_omega);
this->cast_rmass_data(host_rmass);
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->cast_vel_data(host_v);
this->cast_rad_data(host_rad);
this->cast_omega_data(host_omega);
this->cast_rmass_data(host_rmass);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
this->add_vel_data();
this->add_rad_data();
this->add_rmass_data();
this->add_omega_data();
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
const int red_blocks=this->loop(eflag,vflag);
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// Instantiate the class
template class GranHooke<PRECISION,ACC_PRECISION>;

491
lib/gpu/lal_gran_hooke.cu Normal file
View File

@ -0,0 +1,491 @@
// **************************************************************************
// gran_hooke.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the gran/hooke pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
// 确保类型定义可见
#ifndef numtyp
#define numtyp float
#endif
#ifndef numtyp4
#define numtyp4 float4
#endif
#ifndef acctyp
#define acctyp float
#endif
#ifndef acctyp3
#define acctyp3 float3
#endif
#ifndef _DOUBLE_DOUBLE
_texture( pos_tex,float4);
_texture( rad_tex,float);
_texture( vel_tex,float4);
_texture( omega_tex,float4);
_texture( ramss_tex,float);
#else
_texture_2d( pos_tex,int4);
_texture( rad_tex,int2);
_texture_2d( vel_tex,int4);
_texture_2d( omega_tex,int4);
_texture( ramss_tex,int2);
#endif
#if (__CUDACC_VER_MAJOR__ >= 11)
#define rad_tex rad_
#define vel_tex v_
#define omega_tex omega_
#endif
#else
#define pos_tex x_
#define rad_tex rad_
#define vel_tex v_
#define omega_tex omega_
#define ramss_tex mass_rigid_
#define vel_rad_tex v_rad_
#define omega_ramss_tex omega_ramss_
#endif
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define store_answers_t(f, tor, energy, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_down(f.x, s, t_per_atom); \
f.y += shfl_down(f.y, s, t_per_atom); \
f.z += shfl_down(f.z, s, t_per_atom); \
tor.x += shfl_down(tor.x, s, t_per_atom); \
tor.y += shfl_down(tor.y, s, t_per_atom); \
tor.z += shfl_down(tor.z, s, t_per_atom); \
if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \
} \
if (EVFLAG && vflag) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_down(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0 && ii<inum) { \
__global acctyp *ap1=engv+ii; \
if (EVFLAG && eflag) { \
*ap1=energy*(acctyp)0.5; \
ap1+=inum; \
} \
if (EVFLAG && vflag) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=inum; \
} \
} \
ans[ii]=f; \
ans[ii+inum]=tor; \
}
__kernel void k_gran_hooke(const __global numtyp4 *restrict x_, // 位置
const __global numtyp *restrict rad_, // 半径
const __global numtyp4 *restrict v_, // 速度
const __global numtyp4 *restrict omega_, // 角速度
const __global numtyp *restrict sp_lj_in, // 特殊LJ系数
const __global numtyp *restrict mass_rigid_, // 刚体质量
const __global int *dev_nbor, // 邻居列表
const __global int *dev_packed, // 打包的邻居
__global numtyp *restrict mask_, //
__global acctyp3 *restrict ans, // 力
__global acctyp *restrict engv, // 能量
const int eflag, const int vflag, // 能量/维里标志
const int inum, const int nbor_pitch, // 原子数/邻居间距
const int t_per_atom, // 每原子线程数
const numtyp kn, const numtyp kt, // 法向/切向刚度
const numtyp gamman, const numtyp gammat, // 法向/切向阻尼
const numtyp xmu, const int dampflag, // 摩擦系数/阻尼标志
const int freeze_group_bit, // 冻结组位
const int limit_damping, // damping限制标志位
const numtyp dt) { // 时间步长
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp3 f, torque_i;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
torque_i.x=(acctyp)0; torque_i.y=(acctyp)0; torque_i.z=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex);
numtyp radi; fetch(radi,i,rad_tex);
numtyp rmassi; fetch(rmassi,i,mass_rigid_);
numtyp4 iv; fetch4(iv,i,vel_tex);
numtyp4 iomega; fetch4(iomega,i,omega_tex);
int maski = mask_[i];
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex);
numtyp radj; fetch(radj,j,rad_tex);
numtyp rmassj; fetch(rmassj,j,mass_rigid_);
int maskj = mask_[j];
if (factor_lj == 0.0) continue;
// 计算相对位置
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
numtyp radsum = radi + radj;
if (rsq < radsum*radsum)
{
numtyp r = ucl_sqrt(rsq);
numtyp rinv = ucl_recip(r);
numtyp rsqinv = ucl_recip(rsq);
// 相对平移速度
numtyp4 jv; fetch4(jv,j,vel_tex);
numtyp vr1 = iv.x-jv.x;
numtyp vr2 = iv.y-jv.y;
numtyp vr3 = iv.z-jv.z;
// 法向分量
numtyp vnnr = vr1*delx + vr2*dely + vr3*delz;
numtyp vn1 = delx*vnnr*rsqinv;
numtyp vn2 = dely*vnnr*rsqinv;
numtyp vn3 = delz*vnnr*rsqinv;
// 切向分量
numtyp vt1 = vr1 - vn1;
numtyp vt2 = vr2 - vn2;
numtyp vt3 = vr3 - vn3;
// 相对旋转速度
numtyp4 jomega; fetch4(jomega,j,omega_tex);
numtyp wr1 = (radi*iomega.x + radj*jomega.x)*rinv;
numtyp wr2 = (radi*iomega.y + radj*jomega.y)*rinv;
numtyp wr3 = (radi*iomega.z + radj*jomega.z)*rinv;
// 有效质量
numtyp mi = rmassi;
numtyp mj = rmassj;
numtyp meff = (mi*mj)/(mi+mj);
if (maski & freeze_group_bit) meff = mj;
if (maskj & freeze_group_bit) meff = mi;
// 法向力 = 弹性 + 阻尼
numtyp damp = meff*gamman*vnnr*rsqinv;
numtyp ccel = kn*(radsum-r)*rinv - damp;
if (limit_damping&&(ccel < (numtyp)0.0)) ccel = (numtyp)0.0;
// 切向速度(考虑旋转)
numtyp vtr1 = vt1 - (delz*wr2 - dely*wr3);
numtyp vtr2 = vt2 - (delx*wr3 - delz*wr1);
numtyp vtr3 = vt3 - (dely*wr1 - delx*wr2);
numtyp vrel = ucl_sqrt(vtr1*vtr1+vtr2*vtr2+vtr3*vtr3);
numtyp fn = xmu * ucl_abs(ccel*r);
numtyp fs = meff*gammat*vrel;
numtyp ft = 0.0;
if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
numtyp fs1 = -ft*vtr1;
numtyp fs2 = -ft*vtr2;
numtyp fs3 = -ft*vtr3;
numtyp fx = delx*ccel + fs1;
numtyp fy = dely*ccel + fs2;
numtyp fz = delz*ccel + fs3;
fx = factor_lj*fx;
fy = factor_lj*fy;
fz = factor_lj*fz;
f.x+=fx;
f.y+=fy;
f.z+=fz;
// 力矩计算 (与CPU一致: torque = rinv * (r × fs))
numtyp tor1 = rinv * (dely*fs3 - delz*fs2);
numtyp tor2 = rinv * (delz*fs1 - delx*fs3);
numtyp tor3 = rinv * (delx*fs2 - dely*fs1);
tor1 *= factor_lj;
tor2 *= factor_lj;
tor3 *= factor_lj;
// 更新i粒子力矩 (负号表示作用反力)
torque_i.x -= radi * tor1;
torque_i.y -= radi * tor2;
torque_i.z -= radi * tor3;
if (EVFLAG && vflag) {
virial[0] += delx*fx;
virial[1] += dely*fy;
virial[2] += delz*fz;
virial[3] += delx*fy;
virial[4] += delx*fz;
virial[5] += dely*fz;
}
}
}
}
store_answers_t(f,torque_i,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
}
__kernel void k_gran_hooke_fast(const __global numtyp4 *restrict x_, // 位置
const __global numtyp *restrict rad_, // 半径
const __global numtyp4 *restrict v_, // 速度
const __global numtyp4 *restrict omega_, // 角速度
const __global numtyp *restrict sp_lj_in, // 特殊LJ系数
const __global numtyp *restrict mass_rigid_, // 刚体质量
const __global int *dev_nbor, // 邻居列表
const __global int *dev_packed, // 打包的邻居
__global numtyp *restrict mask_, //
__global acctyp3 *restrict ans, // 力
__global acctyp *restrict engv, // 能量
const int eflag, const int vflag, // 能量/维里标志
const int inum, const int nbor_pitch, // 原子数/邻居间距
const int t_per_atom, // 每原子线程数
const numtyp kn, const numtyp kt, // 法向/切向刚度
const numtyp gamman, const numtyp gammat, // 法向/切向阻尼
const numtyp xmu, const int dampflag, // 摩擦系数/阻尼标志
const int freeze_group_bit, // 冻结组位
const int limit_damping, // 剪切位移更新标志位
const numtyp dt) { // 时间步长
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
int n_stride;
local_allocate_store_pair();
acctyp3 f, torque_i;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
torque_i.x=(acctyp)0; torque_i.y=(acctyp)0; torque_i.z=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
__local numtyp sp_lj[4];
if (tid<4) {
sp_lj[tid]=sp_lj_in[tid];
}
//__local numtyp4 ixs[64];
__local numtyp4 ivs[64];
__local numtyp4 iomegas[64];
__local numtyp radis[64];
__local numtyp rmassis[64];
int idx=0;
idx=int(THREAD_ID_X/t_per_atom);
if(THREAD_ID_X%t_per_atom==0&&ii<inum)
{
//fetch4(ixs[idx],i,pos_tex);
fetch4(iomegas[idx],i,omega_tex);
fetch4(ivs[idx],i,vel_tex);
fetch(rmassis[idx],i,mass_rigid_);
fetch(radis[idx],i,rad_tex);
}
__syncthreads();
if (ii<inum) {
numtyp radi; radi=radis[idx];
numtyp rmassi; rmassi=rmassis[idx];
numtyp4 iomega; iomega=iomegas[idx];
numtyp4 iv; iv=ivs[idx];
/*
__syncthreads();
//numtyp4 ix; ix=ixs[idx];
numtyp radi; radi=radis[idx];
//numtyp rmassi; rmassi=rmassis[idx];
//numtyp4 iv; iv=ivs[idx];
//numtyp4 iomega; iomega=iomegas[idx];
*/
numtyp4 ix; fetch4(ix,i,pos_tex);
//numtyp radi; fetch(radi,i,rad_tex);
//numtyp rmassi; fetch(rmassi,i,mass_rigid_);
//numtyp4 iv; fetch4(iv,i,vel_tex);
//numtyp4 iomega; fetch4(iomega,i,omega_tex);
int maski = mask_[i];
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex);
numtyp radj; fetch(radj,j,rad_tex);
numtyp rmassj; fetch(rmassj,j,mass_rigid_);
int maskj = mask_[j];
if (factor_lj == 0.0) continue;
// 计算相对位置
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
numtyp radsum = radi + radj;
if (rsq < radsum*radsum)
{
numtyp r = ucl_sqrt(rsq);
numtyp rinv = ucl_recip(r);
numtyp rsqinv = ucl_recip(rsq);
// 相对平移速度
numtyp4 jv; fetch4(jv,j,vel_tex);
numtyp vr1 = iv.x-jv.x;
numtyp vr2 = iv.y-jv.y;
numtyp vr3 = iv.z-jv.z;
// 法向分量
numtyp vnnr = vr1*delx + vr2*dely + vr3*delz;
numtyp vn1 = delx*vnnr*rsqinv;
numtyp vn2 = dely*vnnr*rsqinv;
numtyp vn3 = delz*vnnr*rsqinv;
// 切向分量
numtyp vt1 = vr1 - vn1;
numtyp vt2 = vr2 - vn2;
numtyp vt3 = vr3 - vn3;
// 相对旋转速度
numtyp4 jomega; fetch4(jomega,j,omega_tex);
numtyp wr1 = (radi*iomega.x + radj*jomega.x)*rinv;
numtyp wr2 = (radi*iomega.y + radj*jomega.y)*rinv;
numtyp wr3 = (radi*iomega.z + radj*jomega.z)*rinv;
// 有效质量
numtyp mi = rmassi;
numtyp mj = rmassj;
numtyp meff = (mi*mj)/(mi+mj);
if (maski & freeze_group_bit) meff = mj;
if (maskj & freeze_group_bit) meff = mi;
// 法向力 = 弹性 + 阻尼
numtyp damp = meff*gamman*vnnr*rsqinv;
numtyp ccel = kn*(radsum-r)*rinv - damp;
if (limit_damping&&(ccel < (numtyp)0.0)) ccel = (numtyp)0.0;
// 切向速度(考虑旋转)
numtyp vtr1 = vt1 - (delz*wr2 - dely*wr3);
numtyp vtr2 = vt2 - (delx*wr3 - delz*wr1);
numtyp vtr3 = vt3 - (dely*wr1 - delx*wr2);
numtyp vrel = ucl_sqrt(vtr1*vtr1+vtr2*vtr2+vtr3*vtr3);
numtyp fn = xmu * ucl_abs(ccel*r);
numtyp fs = meff*gammat*vrel;
numtyp ft = 0.0;
if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
numtyp fs1 = -ft*vtr1;
numtyp fs2 = -ft*vtr2;
numtyp fs3 = -ft*vtr3;
numtyp fx = delx*ccel + fs1;
numtyp fy = dely*ccel + fs2;
numtyp fz = delz*ccel + fs3;
fx = factor_lj*fx;
fy = factor_lj*fy;
fz = factor_lj*fz;
f.x+=fx;
f.y+=fy;
f.z+=fz;
// 力矩计算 (与CPU一致: torque = rinv * (r × fs))
numtyp tor1 = rinv * (dely*fs3 - delz*fs2);
numtyp tor2 = rinv * (delz*fs1 - delx*fs3);
numtyp tor3 = rinv * (delx*fs2 - dely*fs1);
tor1 *= factor_lj;
tor2 *= factor_lj;
tor3 *= factor_lj;
// 更新i粒子力矩 (负号表示作用反力)
torque_i.x -= radi * tor1;
torque_i.y -= radi * tor2;
torque_i.z -= radi * tor3;
if (EVFLAG && vflag) {
virial[0] += delx*fx;
virial[1] += dely*fy;
virial[2] += delz*fz;
virial[3] += delx*fy;
virial[4] += delx*fz;
virial[5] += dely*fz;
}
}
}
}
store_answers_t(f,torque_i,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
}

252
lib/gpu/lal_gran_hooke.h Normal file
View File

@ -0,0 +1,252 @@
/***************************************************************************
* gran_hooke.h
* -------------------
* Trung Dac Nguyen (ORNL)
*
* Class for acceleration of the gran/hooke pair style.
*
* __________________________________________________________________________
* This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
* __________________________________________________________________________
*
* begin :
* email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_GRAN_HOOKE_HISTORY_H
#define LAL_GRAN_HOOKE_HISTORY_H
#include "lal_base_sphere.h"
#include "lal_atom.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class GranHooke : public BaseSphere<numtyp, acctyp> {
public:
GranHooke();
~GranHooke();
typedef struct { double x,y,z; } vec3d;
typedef struct { numtyp x,y,z,w; } vec4d_t;
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes,
double host_k_n, double host_k_t,
double host_gamman, double host_gammat,
double host_xmu, double host_dt, const int host_dampflag,
double *host_special_lj, int *host_mask,
const int nlocal, const int nall, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *screen);
inline void cast_rad_data(double* rad) {
int nall = this->atom->nall();
if (_shared_view) {
c_rad.host.view((numtyp*)rad,nall,*(this->ucl_device));
c_rad.device.view(c_rad.host);
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(c_rad.host.begin(),rad,nall*sizeof(numtyp));
else
for (int i=0; i<nall; i++) c_rad[i]=rad[i];
}
}
// Copy rad to device asynchronously
inline void add_rad_data() {
c_rad.update_device(this->atom->nall(),true);
}
inline void cast_rmass_data(double* rmass) {
int nall = this->atom->nall();
if (_shared_view) {
c_rmass.host.view((numtyp*)rmass,nall,*(this->ucl_device));
c_rmass.device.view(c_rmass.host);
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(c_rmass.host.begin(),rmass,nall*sizeof(numtyp));
else
for (int i=0; i<nall; i++) c_rmass[i]=rmass[i];
}
}
// Copy rmass to device asynchronously
inline void add_rmass_data() {
c_rmass.update_device(this->atom->nall(),true);
}
/*inline void cast_omega_data(double** omega) {
int nall = this->atom->nall();
if (_shared_view) {
c_omega.host.view((numtyp*)omega,nall,*(this->ucl_device));
c_omega.device.view(c_omega.host);
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(c_omega.host.begin(),omega,nall*sizeof(numtyp));
else
for (int i=0; i<nall; i++)
{
c_omega[i]=omega[i];
}
}
}
// Copy rmass to device asynchronously
inline void add_omega_data() {
c_omega.update_device(this->atom->nall(),true);
}*/
inline void cast_omega_data(double** omega) {
int nall = this->atom->nall();
#ifdef GPU_CAST
memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
#else
vec3d *host_p=reinterpret_cast<vec3d*>(&(omega[0][0]));
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(c_omega[0]));
#if (LAL_USE_OMP == 1)
#pragma omp parallel for schedule(static)
#endif
for (int i=0; i<nall; i++) {
vp[i].x=host_p[i].x;
vp[i].y=host_p[i].y;
vp[i].z=host_p[i].z;
vp[i].w=0.0;
}
#endif
}
// Copy rmass to device asynchronously
inline void add_omega_data() {
c_omega.update_device(this->atom->nall()*4,true);
}
inline void cast_vel_data(double** vel) {
int nall = this->atom->nall();
#ifdef GPU_CAST
memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
#else
vec3d *host_p=reinterpret_cast<vec3d*>(&(vel[0][0]));
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(c_vel[0]));
#if (LAL_USE_OMP == 1)
#pragma omp parallel for schedule(static)
#endif
for (int i=0; i<nall; i++) {
vp[i].x=host_p[i].x;
vp[i].y=host_p[i].y;
vp[i].z=host_p[i].z;
vp[i].w=0.0;
}
#endif
}
// Copy rmass to device asynchronously
inline void add_vel_data() {
c_vel.update_device(this->atom->nall()*4,true);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
void compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, tagint *tag,
double **host_v, double *host_rad,
double **host_omega, double *host_rmass, int limit_damping,
const int nlocal, double *boxlo, double *prd);
int **compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_v, double *host_rad,
double **host_omega, double *host_rmass, int limit_damping,
double *boxlo, double *prd);
// --------------------------- TYPE DATA --------------------------
/// coeff.x = k_n, coeff.y = k_t, coeff.z = cutsq, coeff.w = gamman
UCL_D_Vec<numtyp4> coeff;
numtyp _k_n;
numtyp _k_t;
numtyp _gamman;
numtyp _gammat;
numtyp _xmu;
numtyp _dt;
numtyp _dampflag;
int _limit_damping;
/// coeff2.x = gammat, coeff2.y = xmu
UCL_D_Vec<numtyp2> coeff2;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
UCL_D_Vec<numtyp> _mask;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
/// Per-atom arrays
UCL_Vector<numtyp,numtyp> c_rad;
UCL_Vector<numtyp,numtyp> c_omega;
UCL_Vector<numtyp,numtyp> c_vel;
UCL_Vector<numtyp,numtyp> c_rmass;
// --------------------------- TEXTURES -----------------------------
UCL_Texture rad_tex;
UCL_Texture omega_tex;
UCL_Texture vel_tex;
UCL_Texture rmass_tex;
numtyp freeze_group_bit;
int _max_rad_size;
private:
bool _shared_view;
bool _allocated;
int loop(const int _eflag, const int _vflag);
};
}
#endif

View File

@ -0,0 +1,134 @@
/***************************************************************************
* gran_hooke_ext.cpp
* -------------------
* Trung Dac Nguyen (ORNL)
*
* Functions for LAMMPS access to gran/hooke acceleration routines.
*
* __________________________________________________________________________
* This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
* __________________________________________________________________________
*
* begin :
* email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_gran_hooke.h"
using namespace std;
using namespace LAMMPS_AL;
static GranHooke<PRECISION,ACC_PRECISION> GHM;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int gran_hooke_gpu_init(const int ntypes, double **cutsq,
const double host_k_n, const double host_k_t,
const double host_gamman, const double host_gammat,
const double host_xmu, const double host_dt, const int host_dampflag,
double *special_lj, int *host_mask,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
GHM.clear();
gpu_mode=GHM.device->gpu_mode();
double gpu_split=GHM.device->particle_split();
int first_gpu=GHM.device->first_device();
int last_gpu=GHM.device->last_device();
int world_me=GHM.device->world_me();
int gpu_rank=GHM.device->gpu_rank();
int procs_per_gpu=GHM.device->procs_per_gpu();
GHM.device->init_message(screen,"gran/hooke",first_gpu,last_gpu);
bool message=false;
if (GHM.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=GHM.init(ntypes, host_k_n, host_k_t, host_gamman,
host_gammat, host_xmu, host_dt, host_dampflag, special_lj,
host_mask, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen);
GHM.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=GHM.init(ntypes, host_k_n, host_k_t, host_gamman,
host_gammat, host_xmu, host_dt, host_dampflag, special_lj,
host_mask, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen);
GHM.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
GHM.estimate_gpu_overhead();
return init_ok;
}
void gran_hooke_gpu_clear() {
GHM.clear();
}
int ** gran_hooke_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_v, double *host_rad,
double **host_omega, double *host_rmass, int limit_damping,
double *boxlo, double *prd) {
return GHM.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
tag, nspecial, special, eflag, vflag, eatom, vatom,
host_start, ilist, jnum, cpu_time, success, host_v, host_rad,
host_omega, host_rmass, limit_damping, boxlo, prd);
}
void gran_hooke_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, tagint *tag,
double **host_v, double *host_rad,
double **host_omega, double *host_rmass, int limit_damping,
const int nlocal, double *boxlo, double *prd) {
GHM.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time,
success, tag, host_v, host_rad, host_omega, host_rmass, limit_damping, nlocal, boxlo, prd);
}
double gran_hooke_gpu_bytes() {
return GHM.host_memory_usage();
}

View File

@ -0,0 +1,477 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Trung Dac Nguyen (ORNL)
------------------------------------------------------------------------- */
#include "pair_gran_hooke_gpu.h"
#include "atom.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "gpu_extra.h"
#include "neigh_list.h"
#include "neighbor.h"
#include "suffix.h"
#include "update.h"
#include "memory.h"
#include "comm.h"
#include <cmath>
using namespace LAMMPS_NS;
// External functions from cuda library for atom decomposition
int gran_hooke_gpu_init(const int ntypes, double **cutsq,
const double host_k_n, const double host_k_t,
const double host_gamman, const double host_gammat,
const double host_xmu, const double host_dt, const int host_dampflag,
double *special_lj, int *host_mask,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen);
void gran_hooke_gpu_clear();
int ** gran_hooke_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_v, double *host_rad,
double **host_omega, double *host_rmass, int limit_damping,
double *boxlo, double *prd);
void gran_hooke_gpu_compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, tagint *tag,
double **host_v, double *host_rad,
double **host_omega, double *host_rmass, int limit_damping,
const int nlocal, double *boxlo, double *prd);
double gran_hooke_gpu_bytes();
/* ---------------------------------------------------------------------- */
PairGranHookeGPU::PairGranHookeGPU(LAMMPS *lmp) :
PairGranHooke(lmp), gpu_mode(GPU_FORCE)
{
respa_enable = 0;
reinitflag = 0;
cpu_time = 0.0;
suffix_flag |= Suffix::GPU;
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairGranHookeGPU::~PairGranHookeGPU()
{
gran_hooke_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairGranHookeGPU::compute(int eflag, int vflag)
{
ev_init(eflag, vflag);
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
int *ilist, *numneigh, **firstneigh;
int shearupdate = 1;
if (update->setupflag) shearupdate = 0;
if (gpu_mode != GPU_FORCE) {
double sublo[3], subhi[3];
if (domain->triclinic == 0) {
sublo[0] = domain->sublo[0];
sublo[1] = domain->sublo[1];
sublo[2] = domain->sublo[2];
subhi[0] = domain->subhi[0];
subhi[1] = domain->subhi[1];
subhi[2] = domain->subhi[2];
} else {
domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi);
}
inum = atom->nlocal;
firstneigh = gran_hooke_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type,
sublo, subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, &ilist, &numneigh,
cpu_time, success, atom->v,atom->radius, atom->omega, atom->rmass,
limit_damping, domain->boxlo, domain->prd);
} else {
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
gran_hooke_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh,
firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start,
cpu_time, success, atom->tag, atom->v, atom->radius, atom->omega, atom->rmass,
limit_damping, atom->nlocal, domain->boxlo, domain->prd);
}
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
neighbor->build_topology();
if (host_start < inum) {
cpu_time = platform::walltime();
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
cpu_time = platform::walltime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairGranHookeGPU::init_style()
{
if (!atom->radius_flag || !atom->rmass_flag || !atom->omega_flag)
error->all(FLERR, "Pair style gran/hooke/history/gpu requires atom attributes radius, rmass, omega");
if (comm->ghost_velocity == 0)
error->all(FLERR, "Pair gran/h* requires ghost atoms store velocity");
if (gpu_mode == GPU_FORCE)
{
if (history)
neighbor->add_request(this, NeighConst::REQ_SIZE | NeighConst::REQ_HISTORY);
else
neighbor->add_request(this, NeighConst::REQ_SIZE);
}
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i, j);
cut *= cut;
if (cut > maxcut) maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial = 0;
if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
int mnf = 5e-2 * neighbor->oneatom;
//dt = update->dt;
if (history && (fix_history == nullptr)) {
auto cmd = fmt::format("NEIGH_HISTORY_HH{} all NEIGH_HISTORY {}", instance_me, size_history);
fix_history = dynamic_cast<FixNeighHistory *>(
modify->replace_fix("NEIGH_HISTORY_HH_DUMMY" + std::to_string(instance_me), cmd, 1));
fix_history->pair = this;
}
// check for FixFreeze and set freeze_group_bit
auto fixlist = modify->get_fix_by_style("^freeze");
if (fixlist.size() == 0)
freeze_group_bit = 0;
else if (fixlist.size() > 1)
error->all(FLERR, "Only one fix freeze command at a time allowed");
else
freeze_group_bit = fixlist.front()->groupbit;
// check for FixRigid so can extract rigid body masses
fix_rigid = nullptr;
for (const auto &ifix : modify->get_fix_list()) {
if (ifix->rigid_flag) {
if (fix_rigid)
error->all(FLERR, "Only one fix rigid command at a time allowed");
else
fix_rigid = ifix;
}
}
// check for FixPour and FixDeposit so can extract particle radii
auto pours = modify->get_fix_by_style("^pour");
auto deps = modify->get_fix_by_style("^deposit");
// set maxrad_dynamic and maxrad_frozen for each type
// include future FixPour and FixDeposit particles as dynamic
int itype;
for (int i = 1; i <= atom->ntypes; i++) {
onerad_dynamic[i] = onerad_frozen[i] = 0.0;
for (auto &ipour : pours) {
itype = i;
double maxrad = *((double *) ipour->extract("radius", itype));
if (maxrad > 0.0) onerad_dynamic[i] = maxrad;
}
for (auto &idep : deps) {
itype = i;
double maxrad = *((double *) idep->extract("radius", itype));
if (maxrad > 0.0) onerad_dynamic[i] = maxrad;
}
}
double *radius = atom->radius;
int *mask = atom->mask;
int *type = atom->type;
int nlocal = atom->nlocal;
for (int i = 0; i < nlocal; i++) {
if (mask[i] & freeze_group_bit)
onerad_frozen[type[i]] = MAX(onerad_frozen[type[i]], radius[i]);
else
onerad_dynamic[type[i]] = MAX(onerad_dynamic[type[i]], radius[i]);
}
MPI_Allreduce(&onerad_dynamic[1], &maxrad_dynamic[1], atom->ntypes, MPI_DOUBLE, MPI_MAX, world);
MPI_Allreduce(&onerad_frozen[1], &maxrad_frozen[1], atom->ntypes, MPI_DOUBLE, MPI_MAX, world);
// set fix which stores history info
if (history) {
fix_history = dynamic_cast<FixNeighHistory *>(
modify->get_fix_by_id("NEIGH_HISTORY_HH" + std::to_string(instance_me)));
if (!fix_history) error->all(FLERR, "Could not find pair fix neigh history ID");
}
int success = gran_hooke_gpu_init(atom->ntypes + 1, cutsq, kn, kt, gamman, gammat, xmu, update->dt,
dampflag, force->special_lj, atom->mask,
atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial,
cell_size, gpu_mode, screen);
GPU_EXTRA::check_flag(success, error, world);
}
void PairGranHookeGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist,
int *numneigh, int **firstneigh)
{
int i, j, ii, jj, jnum;
double xtmp, ytmp, ztmp, delx, dely, delz, fx, fy, fz;
double radi, radj, radsum, rsq, r, rinv, rsqinv, factor_lj;
double vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
double wr1, wr2, wr3;
double vtr1, vtr2, vtr3, vrel;
double mi, mj, meff, damp, ccel, tor1, tor2, tor3;
double fn,fs,ft,fs1,fs2,fs3;
double shrmag, rsht;
int *jlist;
int *touch, **firsttouch;
double *shear, *allshear, **firstshear;
ev_init(eflag, vflag);
int shearupdate = 1;
if (update->setupflag) shearupdate = 0;
// update rigid body info for owned & ghost atoms if using FixRigid masses
// body[i] = which body atom I is in, -1 if none
// mass_body = mass of each rigid body
if (fix_rigid && neighbor->ago == 0) {
int tmp;
int *body = (int *) fix_rigid->extract("body", tmp);
auto mass_body = (double *) fix_rigid->extract("masstotal", tmp);
if (atom->nmax > nmax) {
memory->destroy(mass_rigid);
nmax = atom->nmax;
memory->create(mass_rigid, nmax, "pair:mass_rigid");
}
int nlocal = atom->nlocal;
for (i = 0; i < nlocal; i++)
if (body[i] >= 0)
mass_rigid[i] = mass_body[body[i]];
else
mass_rigid[i] = 0.0;
comm->forward_comm(this);
}
double **x = atom->x;
double **v = atom->v;
double **f = atom->f;
double **omega = atom->omega;
double **torque = atom->torque;
double *radius = atom->radius;
double *rmass = atom->rmass;
int *mask = atom->mask;
int nlocal = atom->nlocal;
int newton_pair = force->newton_pair;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
firsttouch = fix_history->firstflag;
firstshear = fix_history->firstvalue;
// loop over neighbors of my atoms
for (ii = 0; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
radi = radius[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
factor_lj = special_lj[sbmask(j)];
j &= NEIGHMASK;
if (factor_lj == 0) continue;
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
radj = radius[j];
radsum = radi + radj;
if (rsq < radsum*radsum) {
r = sqrt(rsq);
rinv = 1.0/r;
rsqinv = 1.0/rsq;
// relative translational velocity
vr1 = v[i][0] - v[j][0];
vr2 = v[i][1] - v[j][1];
vr3 = v[i][2] - v[j][2];
// normal component
vnnr = vr1*delx + vr2*dely + vr3*delz;
vn1 = delx*vnnr * rsqinv;
vn2 = dely*vnnr * rsqinv;
vn3 = delz*vnnr * rsqinv;
// tangential component
vt1 = vr1 - vn1;
vt2 = vr2 - vn2;
vt3 = vr3 - vn3;
// relative rotational velocity
wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
// meff = effective mass of pair of particles
// if I or J part of rigid body, use body mass
// if I or J is frozen, meff is other particle
mi = rmass[i];
mj = rmass[j];
if (fix_rigid) {
if (mass_rigid[i] > 0.0) mi = mass_rigid[i];
if (mass_rigid[j] > 0.0) mj = mass_rigid[j];
}
meff = mi*mj / (mi+mj);
if (mask[i] & freeze_group_bit) meff = mj;
if (mask[j] & freeze_group_bit) meff = mi;
// normal forces = Hookian contact + normal velocity damping
damp = meff*gamman*vnnr*rsqinv;
ccel = kn*(radsum-r)*rinv - damp;
if (limit_damping && (ccel < 0.0)) ccel = 0.0;
// relative velocities
vtr1 = vt1 - (delz*wr2-dely*wr3);
vtr2 = vt2 - (delx*wr3-delz*wr1);
vtr3 = vt3 - (dely*wr1-delx*wr2);
vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
vrel = sqrt(vrel);
// force normalization
fn = xmu * fabs(ccel*r);
fs = meff*gammat*vrel;
if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
else ft = 0.0;
// tangential force due to tangential velocity damping
fs1 = -ft*vtr1;
fs2 = -ft*vtr2;
fs3 = -ft*vtr3;
// forces & torques
fx = delx*ccel + fs1;
fy = dely*ccel + fs2;
fz = delz*ccel + fs3;
fx *= factor_lj;
fy *= factor_lj;
fz *= factor_lj;
f[i][0] += fx;
f[i][1] += fy;
f[i][2] += fz;
tor1 = rinv * (dely*fs3 - delz*fs2);
tor2 = rinv * (delz*fs1 - delx*fs3);
tor3 = rinv * (delx*fs2 - dely*fs1);
tor1 *= factor_lj;
tor2 *= factor_lj;
tor3 *= factor_lj;
torque[i][0] -= radi*tor1;
torque[i][1] -= radi*tor2;
torque[i][2] -= radi*tor3;
if (newton_pair || j < nlocal) {
f[j][0] -= fx;
f[j][1] -= fy;
f[j][2] -= fz;
torque[j][0] -= radj*tor1;
torque[j][1] -= radj*tor2;
torque[j][2] -= radj*tor3;
}
if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
0.0,0.0,fx,fy,fz,delx,dely,delz);
}
}
}
if (vflag_fdotr) virial_fdotr_compute();
}
/* ---------------------------------------------------------------------- */
double PairGranHookeGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + gran_hooke_gpu_bytes();
}

View File

@ -0,0 +1,46 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(gran/hooke/gpu,PairGranHookeGPU);
// clang-format on
#else
#ifndef LMP_PAIR_GRAN_HOOKE_GPU_H
#define LMP_PAIR_GRAN_HOOKE_GPU_H
#include "pair_gran_hooke.h"
#include "fix_neigh_history.h"
namespace LAMMPS_NS {
class PairGranHookeGPU : public PairGranHooke {
public:
PairGranHookeGPU(LAMMPS *lmp);
~PairGranHookeGPU() override;
void cpu_compute(int, int, int, int, int *, int *, int **);
void compute(int, int) override;
void init_style() override;
double memory_usage() override;
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
private:
int gpu_mode;
double cpu_time;
};
} // namespace LAMMPS_NS
#endif
#endif