feat(gpu/pair): 添加 Hooke 颗粒力模型的 GPU 加速支持
实现了颗粒间 Hooke 力模型的 GPU 加速计算,主要包含以下内容: * 引入基础类 BaseSphere 处理球形颗粒的位置、半径和角速度数据 * 新增 GranHooke 类实现法向/切向弹性力与阻尼力的 GPU 计算逻辑 * 使用纹理内存优化颗粒属性访问(半径、质量、速度、角速度) * 实现设备端邻居列表构建和并行力计算内核 * 支持与 LAMMPS 的 FixNeighHistory 集成处理接触历史数据 * 添加 GPU 与 CPU 数据转换接口,支持混合粒子分组和冻结状态处理 * 整合到 LAMMPS 的 GPU 模块,可通过 pair_style gran/hooke/gpu 调用
This commit is contained in:
331
lib/gpu/lal_base_sphere.cpp
Normal file
331
lib/gpu/lal_base_sphere.cpp
Normal file
@ -0,0 +1,331 @@
|
||||
/***************************************************************************
|
||||
base_sphere.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (ORNL)
|
||||
|
||||
Base class for pair styles needing per-particle data for position,
|
||||
radius, angular velocity, and type.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#include "lal_base_sphere.h"
|
||||
namespace LAMMPS_AL {
|
||||
#define BaseSphereT BaseSphere<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseSphereT::BaseSphere() : _compiled(false), _max_bytes(0) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
pair_program=nullptr;
|
||||
ucl_device=nullptr;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
pair_program_noev=nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseSphereT::~BaseSphere() {
|
||||
delete ans;
|
||||
delete nbor;
|
||||
k_pair_fast.clear();
|
||||
k_pair.clear();
|
||||
if (pair_program) delete pair_program;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
k_pair_noev.clear();
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseSphereT::bytes_per_atom_atomic(const int max_nbors) const {
|
||||
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
|
||||
nbor->bytes_per_atom(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseSphereT::init_atomic(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
|
||||
gpu_nbor=1;
|
||||
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
|
||||
int _gpu_host=0;
|
||||
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
||||
if (host_nlocal>0)
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
bool charge = false;
|
||||
bool rot = true;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
if (ucl_device!=device->gpu) _compiled=false;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
compile_kernels(*ucl_device,pair_program,k_name);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
|
||||
time_pair.init(*ucl_device);
|
||||
time_pair.zero();
|
||||
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
|
||||
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseSphereT::estimate_gpu_overhead(const int add_kernels) {
|
||||
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseSphereT::clear_atomic() {
|
||||
acc_timers();
|
||||
double avg_split=hd_balancer.all_avg_split();
|
||||
_gpu_overhead*=hd_balancer.timestep();
|
||||
_driver_overhead*=hd_balancer.timestep();
|
||||
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
|
||||
|
||||
time_pair.clear();
|
||||
hd_balancer.clear();
|
||||
|
||||
nbor->clear();
|
||||
ans->clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int * BaseSphereT::reset_nbors(const int nall, const int inum, int *ilist,
|
||||
int *numj, int **firstneigh, bool &success) {
|
||||
success=true;
|
||||
|
||||
int mn=nbor->max_nbor_loop(inum,numj,ilist);
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(inum,mn,success);
|
||||
if (!success)
|
||||
return nullptr;
|
||||
|
||||
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
|
||||
return ilist;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseSphereT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
success=true;
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(inum,host_inum,nbor->max_nbors(),success);
|
||||
if (!success)
|
||||
return;
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
int mn;
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
|
||||
tag, nspecial, special, success, mn, ans->error_flag);
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseSphereT::compute(const int f_ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, tagint *tag,
|
||||
const int nlocal,
|
||||
double *boxlo, double *prd) {
|
||||
acc_timers();
|
||||
int eflags = eflag ? 1 : 0;
|
||||
int vflags = vflag ? 1 : 0;
|
||||
if (eatom) eflags=2;
|
||||
if (vatom) vflags=2;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflags) eflags=2;
|
||||
if (vflags) vflags=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflags, vflags);
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
resize_atom(0,nall,success);
|
||||
zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
}
|
||||
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
|
||||
const int red_blocks=loop(eflags, vflags);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom,ilist,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int **BaseSphereT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double *boxlo, double *prd) {
|
||||
acc_timers();
|
||||
int eflags = eflag ? 1 : 0;
|
||||
int vflags = vflag ? 1 : 0;
|
||||
if (eatom) eflags=2;
|
||||
if (vatom) vflags=2;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflags) eflags=2;
|
||||
if (vflags) vflags=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflags, vflags);
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
resize_atom(0,nall,success);
|
||||
zero_timers();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
hd_balancer.balance(cpu_time);
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return nullptr;
|
||||
hd_balancer.start_timer();
|
||||
} else {
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
|
||||
|
||||
*ilist=nbor->host_ilist.begin();
|
||||
*jnum=nbor->host_acc.begin();
|
||||
|
||||
const int red_blocks=loop(eflags, vflags);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double BaseSphereT::host_memory_usage_atomic() const {
|
||||
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||
4*sizeof(numtyp)+sizeof(BaseSphere<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseSphereT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *k) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
std::string s_fast=std::string(k)+"_fast";
|
||||
if (pair_program) delete pair_program;
|
||||
pair_program=new UCL_Program(dev);
|
||||
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
||||
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_pair_fast.set_function(*pair_program,s_fast.c_str());
|
||||
k_pair.set_function(*pair_program,k);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
oclstring = device->compile_string()+" -DEVFLAG=0";
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
pair_program_noev=new UCL_Program(dev);
|
||||
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
|
||||
#else
|
||||
k_pair_sel = &k_pair_fast;
|
||||
#endif
|
||||
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.has_subgroup_support()) {
|
||||
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
|
||||
#endif
|
||||
if (_threads_per_atom > (int)mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz;
|
||||
device->set_simd_size(mx_subgroup_sz);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template class BaseSphere<PRECISION,ACC_PRECISION>;
|
||||
}
|
||||
176
lib/gpu/lal_base_sphere.h
Normal file
176
lib/gpu/lal_base_sphere.h
Normal file
@ -0,0 +1,176 @@
|
||||
/***************************************************************************
|
||||
base_sphere.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (ORNL)
|
||||
|
||||
Base class for pair styles needing per-particle data for position,
|
||||
radius, angular velocity, and type.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_BASE_SPHERE_H
|
||||
#define LAL_BASE_SPHERE_H
|
||||
|
||||
#include "lal_device.h"
|
||||
#include "lal_balance.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class BaseSphere {
|
||||
public:
|
||||
BaseSphere();
|
||||
virtual ~BaseSphere();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const void *pair_program, const char *k_name);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead(const int add_kernels=0);
|
||||
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(nall, success)) {
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
}
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
inline void resize_local(const int inum, const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,max_nbors,success);
|
||||
}
|
||||
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,host_inum,max_nbors,success);
|
||||
}
|
||||
|
||||
/// Clear all host and device data
|
||||
void clear_atomic();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom_atomic(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage_atomic() const;
|
||||
|
||||
/// Accumulate timers
|
||||
inline void acc_timers() {
|
||||
if (device->time_device()) {
|
||||
nbor->acc_timers(screen);
|
||||
time_pair.add_to_total();
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero timers
|
||||
inline void zero_timers() {
|
||||
time_pair.zero();
|
||||
atom->zero_timers();
|
||||
ans->zero_timers();
|
||||
}
|
||||
|
||||
/// Copy neighbor list from host
|
||||
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
|
||||
int **firstneigh, bool &success);
|
||||
|
||||
/// Build neighbor list on device
|
||||
void build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, bool &success);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, tagint *tag,
|
||||
const int nlocal, double *boxlo, double *prd);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int **compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double *boxlo, double *prd);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
Device<numtyp,acctyp> *device;
|
||||
UCL_Device *ucl_device;
|
||||
UCL_Timer time_pair;
|
||||
Balance<numtyp,acctyp> hd_balancer;
|
||||
FILE *screen;
|
||||
|
||||
// --------------------------- ATOM DATA --------------------------
|
||||
Atom<numtyp,acctyp> *atom;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
Answer<numtyp,acctyp> *ans;
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
Neighbor *nbor;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program, *pair_program_noev;
|
||||
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
|
||||
inline int block_size() { return _block_size; }
|
||||
inline void set_kernel(const int eflag, const int vflag) {
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
if (eflag || vflag) k_pair_sel = &k_pair_fast;
|
||||
else k_pair_sel = &k_pair_noev;
|
||||
#endif
|
||||
}
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture pos_tex;
|
||||
|
||||
protected:
|
||||
bool _compiled;
|
||||
int _block_size, _threads_per_atom;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
|
||||
|
||||
virtual int loop(const int eflag, const int vflag) = 0;
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
||||
391
lib/gpu/lal_gran_hooke.cpp
Normal file
391
lib/gpu/lal_gran_hooke.cpp
Normal file
@ -0,0 +1,391 @@
|
||||
/***************************************************************************
|
||||
* gran_hooke.cpp
|
||||
* -------------------
|
||||
* Trung Dac Nguyen (ORNL)
|
||||
*
|
||||
* Class for acceleration of the gran/hooke pair style.
|
||||
*
|
||||
* __________________________________________________________________________
|
||||
* This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
* __________________________________________________________________________
|
||||
*
|
||||
* begin :
|
||||
* email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "gran_hooke_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *gran_hooke=0;
|
||||
#else
|
||||
#include "gran_hooke_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_gran_hooke.h"
|
||||
#include <cassert>
|
||||
using namespace LAMMPS_AL;
|
||||
#define GranHookeT GranHooke<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
GranHookeT::GranHooke() : BaseSphere<numtyp,acctyp>(),
|
||||
_allocated(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
GranHookeT::~GranHooke() {
|
||||
clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int GranHookeT::init(const int ntypes,
|
||||
double host_k_n, double host_k_t,
|
||||
double host_gamman, double host_gammat,
|
||||
double host_xmu, double host_dt, const int host_dampflag,
|
||||
double *host_special_lj, int *host_mask,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen) {
|
||||
int onetype=0;
|
||||
int success;
|
||||
success = this->init_atomic(nlocal, nall, max_nbors, maxspecial,
|
||||
cell_size, gpu_split, screen,
|
||||
gran_hooke, "k_gran_hooke");
|
||||
if (success != 0)
|
||||
return success;
|
||||
|
||||
_shared_view = this->ucl_device->shared_memory() && sizeof(numtyp)==sizeof(double);
|
||||
|
||||
|
||||
// allocate rad
|
||||
|
||||
int ef_nall=nall;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
|
||||
_max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
|
||||
|
||||
if (!_shared_view)
|
||||
{
|
||||
c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
|
||||
c_rmass.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
|
||||
c_omega.alloc(_max_rad_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
|
||||
c_vel.alloc(_max_rad_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
|
||||
}
|
||||
|
||||
rad_tex.get_texture(*(this->pair_program),"rad_tex");
|
||||
rad_tex.bind_float(c_rad,1);
|
||||
|
||||
rmass_tex.get_texture(*(this->pair_program),"rmass_tex");
|
||||
rmass_tex.bind_float(c_rmass,1);
|
||||
|
||||
omega_tex.get_texture(*(this->pair_program),"omega_tex");
|
||||
omega_tex.bind_float(c_omega,4);
|
||||
|
||||
vel_tex.get_texture(*(this->pair_program),"vel_tex");
|
||||
vel_tex.bind_float(c_vel,4);
|
||||
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
int max_shared_types=this->device->max_shared_types();
|
||||
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {//TODO
|
||||
lj_types=max_shared_types;
|
||||
shared_types=true;
|
||||
}
|
||||
//shared_types=false;//TEST
|
||||
_k_n=host_k_n;
|
||||
_k_t=host_k_t;
|
||||
_gamman=host_gamman;
|
||||
_gammat=host_gammat;
|
||||
_xmu=host_xmu;
|
||||
_dampflag=host_dampflag;
|
||||
_dt=host_dt;
|
||||
|
||||
UCL_H_Vec<int> h_mask;
|
||||
_mask.alloc(_max_rad_size,*(this->ucl_device),UCL_READ_ONLY);
|
||||
h_mask.view(host_mask,_max_rad_size,*(this->ucl_device));
|
||||
ucl_copy(_mask,h_mask,false);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dview.view(host_special_lj,4,*(this->ucl_device));
|
||||
ucl_copy(sp_lj,dview,false);
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=sp_lj.row_bytes() + _mask.row_bytes();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void GranHookeT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
|
||||
_allocated = false;
|
||||
sp_lj.clear();
|
||||
_mask.clear();
|
||||
c_rad.clear();
|
||||
c_omega.clear();
|
||||
c_rmass.clear();
|
||||
c_vel.clear();
|
||||
this->clear_atomic();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double GranHookeT::host_memory_usage() const {
|
||||
return this->host_memory_usage_atomic() +
|
||||
sizeof(GranHooke<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int GranHookeT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors) +
|
||||
sizeof(numtyp4)*2 + sizeof(numtyp2) + sizeof(int) +
|
||||
sizeof(numtyp)*3 + sizeof(numtyp)*3*max_nbors;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int GranHookeT::loop(const int _eflag, const int _vflag) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX = this->block_size();
|
||||
//int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) / BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX, BX);
|
||||
this->k_pair_fast.run(&this->atom->x, &c_rad, &c_vel,
|
||||
&c_omega, &sp_lj, &c_rmass,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_mask, &this->ans->force,
|
||||
&this->ans->engv, &_eflag, &_vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom,
|
||||
&_k_n, &_k_t, &_gamman, &_gammat, &_xmu,
|
||||
&_dampflag, &freeze_group_bit,
|
||||
&_limit_damping, &_dt);
|
||||
|
||||
} else {
|
||||
this->k_pair.set_size(GX, BX);
|
||||
this->k_pair.run(&this->atom->x, &c_rad, &c_vel,
|
||||
&c_omega, &sp_lj, &c_rmass,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_mask, &this->ans->force,
|
||||
&this->ans->engv, &_eflag, &_vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom,
|
||||
&_k_n, &_k_t, &_gamman, &_gammat, &_xmu,
|
||||
&_dampflag, &freeze_group_bit,
|
||||
&_limit_damping, &_dt);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void GranHookeT::compute(const int f_ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, tagint *tag,
|
||||
double **host_v, double *host_rad,
|
||||
double **host_omega, double *host_rmass, int limit_damping,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
this->acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
this->set_kernel(eflag,vflag);
|
||||
_limit_damping = limit_damping;
|
||||
// ------------------- Resize data array --------------------------
|
||||
if (nall>_max_rad_size) {
|
||||
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
if (!_shared_view) {
|
||||
c_rad.resize(_max_rad_size);
|
||||
rad_tex.bind_float(c_rad,1);
|
||||
|
||||
c_omega.resize(_max_rad_size);
|
||||
omega_tex.bind_float(c_omega,4);
|
||||
|
||||
c_vel.resize(_max_rad_size);
|
||||
vel_tex.bind_float(c_vel,4);
|
||||
|
||||
c_rmass.resize(_max_rad_size);
|
||||
rmass_tex.bind_float(c_rmass,1);
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
// load balance, returning the atom count on the device (inum)
|
||||
int ago=this->hd_balancer.ago_first(f_ago);
|
||||
//this->hd_balancer.balance(cpu_time);
|
||||
//int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
this->ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
if (ago==0) {
|
||||
this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
}
|
||||
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->cast_vel_data(host_v);
|
||||
this->cast_rad_data(host_rad);
|
||||
this->cast_omega_data(host_omega);
|
||||
this->cast_rmass_data(host_rmass);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
this->add_vel_data();
|
||||
this->add_rad_data();
|
||||
this->add_rmass_data();
|
||||
this->add_omega_data();
|
||||
|
||||
const int red_blocks=this->loop(eflag,vflag);
|
||||
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
|
||||
this->device->add_ans_object(this->ans);
|
||||
this->hd_balancer.stop_timer();
|
||||
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int** GranHookeT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_v, double *host_rad,
|
||||
double **host_omega, double *host_rmass, int limit_damping,
|
||||
double *boxlo, double *prd) {
|
||||
this->acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
this->set_kernel(eflag,vflag);
|
||||
|
||||
_limit_damping= limit_damping;
|
||||
|
||||
// ------------------- Resize data array --------------------------
|
||||
if (nall>_max_rad_size) {
|
||||
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
if (!_shared_view) {
|
||||
c_rad.resize(_max_rad_size);
|
||||
rad_tex.bind_float(c_rad,1);
|
||||
|
||||
c_omega.resize(_max_rad_size);
|
||||
omega_tex.bind_float(c_omega,4);
|
||||
|
||||
c_vel.resize(_max_rad_size);
|
||||
vel_tex.bind_float(c_vel,4);
|
||||
|
||||
c_rmass.resize(_max_rad_size);
|
||||
rmass_tex.bind_float(c_rmass,1);
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// load balance, returning the atom count on the device (inum)
|
||||
this->hd_balancer.balance(cpu_time);
|
||||
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||
this->ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return nullptr;
|
||||
this->cast_vel_data(host_v);
|
||||
this->cast_rad_data(host_rad);
|
||||
this->cast_omega_data(host_omega);
|
||||
this->cast_rmass_data(host_rmass);
|
||||
this->hd_balancer.start_timer();
|
||||
} else {
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->cast_vel_data(host_v);
|
||||
this->cast_rad_data(host_rad);
|
||||
this->cast_omega_data(host_omega);
|
||||
this->cast_rmass_data(host_rmass);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
this->add_vel_data();
|
||||
this->add_rad_data();
|
||||
this->add_rmass_data();
|
||||
this->add_omega_data();
|
||||
*ilist=this->nbor->host_ilist.begin();
|
||||
*jnum=this->nbor->host_acc.begin();
|
||||
|
||||
const int red_blocks=this->loop(eflag,vflag);
|
||||
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||
this->device->add_ans_object(this->ans);
|
||||
this->hd_balancer.stop_timer();
|
||||
|
||||
return this->nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
// Instantiate the class
|
||||
template class GranHooke<PRECISION,ACC_PRECISION>;
|
||||
491
lib/gpu/lal_gran_hooke.cu
Normal file
491
lib/gpu/lal_gran_hooke.cu
Normal file
@ -0,0 +1,491 @@
|
||||
// **************************************************************************
|
||||
// gran_hooke.cu
|
||||
// -------------------
|
||||
// Trung Dac Nguyen (ORNL)
|
||||
//
|
||||
// Device code for acceleration of the gran/hooke pair style
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************
|
||||
|
||||
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
// 确保类型定义可见
|
||||
#ifndef numtyp
|
||||
#define numtyp float
|
||||
#endif
|
||||
#ifndef numtyp4
|
||||
#define numtyp4 float4
|
||||
#endif
|
||||
#ifndef acctyp
|
||||
#define acctyp float
|
||||
#endif
|
||||
#ifndef acctyp3
|
||||
#define acctyp3 float3
|
||||
#endif
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
_texture( pos_tex,float4);
|
||||
_texture( rad_tex,float);
|
||||
_texture( vel_tex,float4);
|
||||
_texture( omega_tex,float4);
|
||||
_texture( ramss_tex,float);
|
||||
#else
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( rad_tex,int2);
|
||||
_texture_2d( vel_tex,int4);
|
||||
_texture_2d( omega_tex,int4);
|
||||
_texture( ramss_tex,int2);
|
||||
#endif
|
||||
|
||||
#if (__CUDACC_VER_MAJOR__ >= 11)
|
||||
#define rad_tex rad_
|
||||
#define vel_tex v_
|
||||
#define omega_tex omega_
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define rad_tex rad_
|
||||
#define vel_tex v_
|
||||
#define omega_tex omega_
|
||||
#define ramss_tex mass_rigid_
|
||||
#define vel_rad_tex v_rad_
|
||||
#define omega_ramss_tex omega_ramss_
|
||||
#endif
|
||||
|
||||
#define MIN(A,B) ((A) < (B) ? (A) : (B))
|
||||
|
||||
#define store_answers_t(f, tor, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
f.x += shfl_down(f.x, s, t_per_atom); \
|
||||
f.y += shfl_down(f.y, s, t_per_atom); \
|
||||
f.z += shfl_down(f.z, s, t_per_atom); \
|
||||
tor.x += shfl_down(tor.x, s, t_per_atom); \
|
||||
tor.y += shfl_down(tor.y, s, t_per_atom); \
|
||||
tor.z += shfl_down(tor.z, s, t_per_atom); \
|
||||
if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \
|
||||
} \
|
||||
if (EVFLAG && vflag) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r] += shfl_down(virial[r], s, t_per_atom); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
__global acctyp *ap1=engv+ii; \
|
||||
if (EVFLAG && eflag) { \
|
||||
*ap1=energy*(acctyp)0.5; \
|
||||
ap1+=inum; \
|
||||
} \
|
||||
if (EVFLAG && vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
*ap1=virial[i]*(acctyp)0.5; \
|
||||
ap1+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
ans[ii+inum]=tor; \
|
||||
}
|
||||
|
||||
__kernel void k_gran_hooke(const __global numtyp4 *restrict x_, // 位置
|
||||
const __global numtyp *restrict rad_, // 半径
|
||||
const __global numtyp4 *restrict v_, // 速度
|
||||
const __global numtyp4 *restrict omega_, // 角速度
|
||||
const __global numtyp *restrict sp_lj_in, // 特殊LJ系数
|
||||
const __global numtyp *restrict mass_rigid_, // 刚体质量
|
||||
const __global int *dev_nbor, // 邻居列表
|
||||
const __global int *dev_packed, // 打包的邻居
|
||||
__global numtyp *restrict mask_, //
|
||||
__global acctyp3 *restrict ans, // 力
|
||||
__global acctyp *restrict engv, // 能量
|
||||
const int eflag, const int vflag, // 能量/维里标志
|
||||
const int inum, const int nbor_pitch, // 原子数/邻居间距
|
||||
const int t_per_atom, // 每原子线程数
|
||||
const numtyp kn, const numtyp kt, // 法向/切向刚度
|
||||
const numtyp gamman, const numtyp gammat, // 法向/切向阻尼
|
||||
const numtyp xmu, const int dampflag, // 摩擦系数/阻尼标志
|
||||
const int freeze_group_bit, // 冻结组位
|
||||
const int limit_damping, // damping限制标志位
|
||||
const numtyp dt) { // 时间步长
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
sp_lj[0]=sp_lj_in[0];
|
||||
sp_lj[1]=sp_lj_in[1];
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
|
||||
acctyp3 f, torque_i;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
torque_i.x=(acctyp)0; torque_i.y=(acctyp)0; torque_i.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
|
||||
if (EVFLAG) {
|
||||
energy=(acctyp)0;
|
||||
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
|
||||
}
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
numtyp radi; fetch(radi,i,rad_tex);
|
||||
numtyp rmassi; fetch(rmassi,i,mass_rigid_);
|
||||
numtyp4 iv; fetch4(iv,i,vel_tex);
|
||||
numtyp4 iomega; fetch4(iomega,i,omega_tex);
|
||||
int maski = mask_[i];
|
||||
|
||||
numtyp factor_lj;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
numtyp radj; fetch(radj,j,rad_tex);
|
||||
numtyp rmassj; fetch(rmassj,j,mass_rigid_);
|
||||
int maskj = mask_[j];
|
||||
|
||||
|
||||
if (factor_lj == 0.0) continue;
|
||||
// 计算相对位置
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
numtyp radsum = radi + radj;
|
||||
|
||||
if (rsq < radsum*radsum)
|
||||
{
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
numtyp rinv = ucl_recip(r);
|
||||
numtyp rsqinv = ucl_recip(rsq);
|
||||
|
||||
// 相对平移速度
|
||||
numtyp4 jv; fetch4(jv,j,vel_tex);
|
||||
numtyp vr1 = iv.x-jv.x;
|
||||
numtyp vr2 = iv.y-jv.y;
|
||||
numtyp vr3 = iv.z-jv.z;
|
||||
|
||||
// 法向分量
|
||||
numtyp vnnr = vr1*delx + vr2*dely + vr3*delz;
|
||||
numtyp vn1 = delx*vnnr*rsqinv;
|
||||
numtyp vn2 = dely*vnnr*rsqinv;
|
||||
numtyp vn3 = delz*vnnr*rsqinv;
|
||||
|
||||
// 切向分量
|
||||
numtyp vt1 = vr1 - vn1;
|
||||
numtyp vt2 = vr2 - vn2;
|
||||
numtyp vt3 = vr3 - vn3;
|
||||
|
||||
// 相对旋转速度
|
||||
|
||||
numtyp4 jomega; fetch4(jomega,j,omega_tex);
|
||||
numtyp wr1 = (radi*iomega.x + radj*jomega.x)*rinv;
|
||||
numtyp wr2 = (radi*iomega.y + radj*jomega.y)*rinv;
|
||||
numtyp wr3 = (radi*iomega.z + radj*jomega.z)*rinv;
|
||||
|
||||
// 有效质量
|
||||
numtyp mi = rmassi;
|
||||
numtyp mj = rmassj;
|
||||
numtyp meff = (mi*mj)/(mi+mj);
|
||||
if (maski & freeze_group_bit) meff = mj;
|
||||
if (maskj & freeze_group_bit) meff = mi;
|
||||
|
||||
// 法向力 = 弹性 + 阻尼
|
||||
numtyp damp = meff*gamman*vnnr*rsqinv;
|
||||
numtyp ccel = kn*(radsum-r)*rinv - damp;
|
||||
if (limit_damping&&(ccel < (numtyp)0.0)) ccel = (numtyp)0.0;
|
||||
|
||||
// 切向速度(考虑旋转)
|
||||
numtyp vtr1 = vt1 - (delz*wr2 - dely*wr3);
|
||||
numtyp vtr2 = vt2 - (delx*wr3 - delz*wr1);
|
||||
numtyp vtr3 = vt3 - (dely*wr1 - delx*wr2);
|
||||
numtyp vrel = ucl_sqrt(vtr1*vtr1+vtr2*vtr2+vtr3*vtr3);
|
||||
|
||||
numtyp fn = xmu * ucl_abs(ccel*r);
|
||||
numtyp fs = meff*gammat*vrel;
|
||||
|
||||
numtyp ft = 0.0;
|
||||
if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
|
||||
|
||||
numtyp fs1 = -ft*vtr1;
|
||||
numtyp fs2 = -ft*vtr2;
|
||||
numtyp fs3 = -ft*vtr3;
|
||||
|
||||
|
||||
numtyp fx = delx*ccel + fs1;
|
||||
numtyp fy = dely*ccel + fs2;
|
||||
numtyp fz = delz*ccel + fs3;
|
||||
|
||||
|
||||
fx = factor_lj*fx;
|
||||
fy = factor_lj*fy;
|
||||
fz = factor_lj*fz;
|
||||
|
||||
f.x+=fx;
|
||||
f.y+=fy;
|
||||
f.z+=fz;
|
||||
|
||||
// 力矩计算 (与CPU一致: torque = rinv * (r × fs))
|
||||
numtyp tor1 = rinv * (dely*fs3 - delz*fs2);
|
||||
numtyp tor2 = rinv * (delz*fs1 - delx*fs3);
|
||||
numtyp tor3 = rinv * (delx*fs2 - dely*fs1);
|
||||
|
||||
tor1 *= factor_lj;
|
||||
tor2 *= factor_lj;
|
||||
tor3 *= factor_lj;
|
||||
|
||||
// 更新i粒子力矩 (负号表示作用反力)
|
||||
torque_i.x -= radi * tor1;
|
||||
torque_i.y -= radi * tor2;
|
||||
torque_i.z -= radi * tor3;
|
||||
|
||||
if (EVFLAG && vflag) {
|
||||
virial[0] += delx*fx;
|
||||
virial[1] += dely*fy;
|
||||
virial[2] += delz*fz;
|
||||
virial[3] += delx*fy;
|
||||
virial[4] += delx*fz;
|
||||
virial[5] += dely*fz;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
store_answers_t(f,torque_i,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,
|
||||
vflag,ans,engv);
|
||||
}
|
||||
|
||||
__kernel void k_gran_hooke_fast(const __global numtyp4 *restrict x_, // 位置
|
||||
const __global numtyp *restrict rad_, // 半径
|
||||
const __global numtyp4 *restrict v_, // 速度
|
||||
const __global numtyp4 *restrict omega_, // 角速度
|
||||
const __global numtyp *restrict sp_lj_in, // 特殊LJ系数
|
||||
const __global numtyp *restrict mass_rigid_, // 刚体质量
|
||||
const __global int *dev_nbor, // 邻居列表
|
||||
const __global int *dev_packed, // 打包的邻居
|
||||
__global numtyp *restrict mask_, //
|
||||
__global acctyp3 *restrict ans, // 力
|
||||
__global acctyp *restrict engv, // 能量
|
||||
const int eflag, const int vflag, // 能量/维里标志
|
||||
const int inum, const int nbor_pitch, // 原子数/邻居间距
|
||||
const int t_per_atom, // 每原子线程数
|
||||
const numtyp kn, const numtyp kt, // 法向/切向刚度
|
||||
const numtyp gamman, const numtyp gammat, // 法向/切向阻尼
|
||||
const numtyp xmu, const int dampflag, // 摩擦系数/阻尼标志
|
||||
const int freeze_group_bit, // 冻结组位
|
||||
const int limit_damping, // 剪切位移更新标志位
|
||||
const numtyp dt) { // 时间步长
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
|
||||
|
||||
int n_stride;
|
||||
local_allocate_store_pair();
|
||||
|
||||
acctyp3 f, torque_i;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
torque_i.x=(acctyp)0; torque_i.y=(acctyp)0; torque_i.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
|
||||
if (EVFLAG) {
|
||||
energy=(acctyp)0;
|
||||
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
|
||||
}
|
||||
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
if (tid<4) {
|
||||
sp_lj[tid]=sp_lj_in[tid];
|
||||
}
|
||||
|
||||
//__local numtyp4 ixs[64];
|
||||
__local numtyp4 ivs[64];
|
||||
__local numtyp4 iomegas[64];
|
||||
__local numtyp radis[64];
|
||||
__local numtyp rmassis[64];
|
||||
|
||||
int idx=0;
|
||||
idx=int(THREAD_ID_X/t_per_atom);
|
||||
if(THREAD_ID_X%t_per_atom==0&&ii<inum)
|
||||
{
|
||||
//fetch4(ixs[idx],i,pos_tex);
|
||||
fetch4(iomegas[idx],i,omega_tex);
|
||||
fetch4(ivs[idx],i,vel_tex);
|
||||
fetch(rmassis[idx],i,mass_rigid_);
|
||||
fetch(radis[idx],i,rad_tex);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
numtyp radi; radi=radis[idx];
|
||||
numtyp rmassi; rmassi=rmassis[idx];
|
||||
numtyp4 iomega; iomega=iomegas[idx];
|
||||
numtyp4 iv; iv=ivs[idx];
|
||||
/*
|
||||
|
||||
|
||||
__syncthreads();
|
||||
//numtyp4 ix; ix=ixs[idx];
|
||||
numtyp radi; radi=radis[idx];
|
||||
//numtyp rmassi; rmassi=rmassis[idx];
|
||||
//numtyp4 iv; iv=ivs[idx];
|
||||
//numtyp4 iomega; iomega=iomegas[idx];
|
||||
*/
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
//numtyp radi; fetch(radi,i,rad_tex);
|
||||
//numtyp rmassi; fetch(rmassi,i,mass_rigid_);
|
||||
//numtyp4 iv; fetch4(iv,i,vel_tex);
|
||||
//numtyp4 iomega; fetch4(iomega,i,omega_tex);
|
||||
|
||||
int maski = mask_[i];
|
||||
|
||||
numtyp factor_lj;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_packed+nbor+n_stride);
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
numtyp radj; fetch(radj,j,rad_tex);
|
||||
numtyp rmassj; fetch(rmassj,j,mass_rigid_);
|
||||
int maskj = mask_[j];
|
||||
|
||||
|
||||
if (factor_lj == 0.0) continue;
|
||||
// 计算相对位置
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
numtyp radsum = radi + radj;
|
||||
|
||||
if (rsq < radsum*radsum)
|
||||
{
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
numtyp rinv = ucl_recip(r);
|
||||
numtyp rsqinv = ucl_recip(rsq);
|
||||
|
||||
// 相对平移速度
|
||||
numtyp4 jv; fetch4(jv,j,vel_tex);
|
||||
numtyp vr1 = iv.x-jv.x;
|
||||
numtyp vr2 = iv.y-jv.y;
|
||||
numtyp vr3 = iv.z-jv.z;
|
||||
|
||||
// 法向分量
|
||||
numtyp vnnr = vr1*delx + vr2*dely + vr3*delz;
|
||||
numtyp vn1 = delx*vnnr*rsqinv;
|
||||
numtyp vn2 = dely*vnnr*rsqinv;
|
||||
numtyp vn3 = delz*vnnr*rsqinv;
|
||||
|
||||
// 切向分量
|
||||
numtyp vt1 = vr1 - vn1;
|
||||
numtyp vt2 = vr2 - vn2;
|
||||
numtyp vt3 = vr3 - vn3;
|
||||
|
||||
// 相对旋转速度
|
||||
|
||||
numtyp4 jomega; fetch4(jomega,j,omega_tex);
|
||||
numtyp wr1 = (radi*iomega.x + radj*jomega.x)*rinv;
|
||||
numtyp wr2 = (radi*iomega.y + radj*jomega.y)*rinv;
|
||||
numtyp wr3 = (radi*iomega.z + radj*jomega.z)*rinv;
|
||||
|
||||
// 有效质量
|
||||
numtyp mi = rmassi;
|
||||
numtyp mj = rmassj;
|
||||
numtyp meff = (mi*mj)/(mi+mj);
|
||||
if (maski & freeze_group_bit) meff = mj;
|
||||
if (maskj & freeze_group_bit) meff = mi;
|
||||
|
||||
// 法向力 = 弹性 + 阻尼
|
||||
numtyp damp = meff*gamman*vnnr*rsqinv;
|
||||
numtyp ccel = kn*(radsum-r)*rinv - damp;
|
||||
if (limit_damping&&(ccel < (numtyp)0.0)) ccel = (numtyp)0.0;
|
||||
|
||||
// 切向速度(考虑旋转)
|
||||
numtyp vtr1 = vt1 - (delz*wr2 - dely*wr3);
|
||||
numtyp vtr2 = vt2 - (delx*wr3 - delz*wr1);
|
||||
numtyp vtr3 = vt3 - (dely*wr1 - delx*wr2);
|
||||
numtyp vrel = ucl_sqrt(vtr1*vtr1+vtr2*vtr2+vtr3*vtr3);
|
||||
|
||||
numtyp fn = xmu * ucl_abs(ccel*r);
|
||||
numtyp fs = meff*gammat*vrel;
|
||||
|
||||
numtyp ft = 0.0;
|
||||
if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
|
||||
|
||||
numtyp fs1 = -ft*vtr1;
|
||||
numtyp fs2 = -ft*vtr2;
|
||||
numtyp fs3 = -ft*vtr3;
|
||||
|
||||
|
||||
numtyp fx = delx*ccel + fs1;
|
||||
numtyp fy = dely*ccel + fs2;
|
||||
numtyp fz = delz*ccel + fs3;
|
||||
|
||||
fx = factor_lj*fx;
|
||||
fy = factor_lj*fy;
|
||||
fz = factor_lj*fz;
|
||||
|
||||
f.x+=fx;
|
||||
f.y+=fy;
|
||||
f.z+=fz;
|
||||
|
||||
// 力矩计算 (与CPU一致: torque = rinv * (r × fs))
|
||||
numtyp tor1 = rinv * (dely*fs3 - delz*fs2);
|
||||
numtyp tor2 = rinv * (delz*fs1 - delx*fs3);
|
||||
numtyp tor3 = rinv * (delx*fs2 - dely*fs1);
|
||||
|
||||
tor1 *= factor_lj;
|
||||
tor2 *= factor_lj;
|
||||
tor3 *= factor_lj;
|
||||
|
||||
// 更新i粒子力矩 (负号表示作用反力)
|
||||
torque_i.x -= radi * tor1;
|
||||
torque_i.y -= radi * tor2;
|
||||
torque_i.z -= radi * tor3;
|
||||
|
||||
if (EVFLAG && vflag) {
|
||||
virial[0] += delx*fx;
|
||||
virial[1] += dely*fy;
|
||||
virial[2] += delz*fz;
|
||||
virial[3] += delx*fy;
|
||||
virial[4] += delx*fz;
|
||||
virial[5] += dely*fz;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
store_answers_t(f,torque_i,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,
|
||||
vflag,ans,engv);
|
||||
}
|
||||
252
lib/gpu/lal_gran_hooke.h
Normal file
252
lib/gpu/lal_gran_hooke.h
Normal file
@ -0,0 +1,252 @@
|
||||
/***************************************************************************
|
||||
* gran_hooke.h
|
||||
* -------------------
|
||||
* Trung Dac Nguyen (ORNL)
|
||||
*
|
||||
* Class for acceleration of the gran/hooke pair style.
|
||||
*
|
||||
* __________________________________________________________________________
|
||||
* This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
* __________________________________________________________________________
|
||||
*
|
||||
* begin :
|
||||
* email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_GRAN_HOOKE_HISTORY_H
|
||||
#define LAL_GRAN_HOOKE_HISTORY_H
|
||||
|
||||
#include "lal_base_sphere.h"
|
||||
#include "lal_atom.h"
|
||||
|
||||
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class GranHooke : public BaseSphere<numtyp, acctyp> {
|
||||
public:
|
||||
GranHooke();
|
||||
~GranHooke();
|
||||
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { numtyp x,y,z,w; } vec4d_t;
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes,
|
||||
double host_k_n, double host_k_t,
|
||||
double host_gamman, double host_gammat,
|
||||
double host_xmu, double host_dt, const int host_dampflag,
|
||||
double *host_special_lj, int *host_mask,
|
||||
const int nlocal, const int nall, const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split, FILE *screen);
|
||||
|
||||
inline void cast_rad_data(double* rad) {
|
||||
int nall = this->atom->nall();
|
||||
if (_shared_view) {
|
||||
c_rad.host.view((numtyp*)rad,nall,*(this->ucl_device));
|
||||
c_rad.device.view(c_rad.host);
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(c_rad.host.begin(),rad,nall*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<nall; i++) c_rad[i]=rad[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Copy rad to device asynchronously
|
||||
inline void add_rad_data() {
|
||||
c_rad.update_device(this->atom->nall(),true);
|
||||
}
|
||||
|
||||
inline void cast_rmass_data(double* rmass) {
|
||||
int nall = this->atom->nall();
|
||||
if (_shared_view) {
|
||||
c_rmass.host.view((numtyp*)rmass,nall,*(this->ucl_device));
|
||||
c_rmass.device.view(c_rmass.host);
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(c_rmass.host.begin(),rmass,nall*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<nall; i++) c_rmass[i]=rmass[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Copy rmass to device asynchronously
|
||||
inline void add_rmass_data() {
|
||||
c_rmass.update_device(this->atom->nall(),true);
|
||||
}
|
||||
|
||||
/*inline void cast_omega_data(double** omega) {
|
||||
int nall = this->atom->nall();
|
||||
if (_shared_view) {
|
||||
c_omega.host.view((numtyp*)omega,nall,*(this->ucl_device));
|
||||
c_omega.device.view(c_omega.host);
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(c_omega.host.begin(),omega,nall*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<nall; i++)
|
||||
{
|
||||
c_omega[i]=omega[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Copy rmass to device asynchronously
|
||||
inline void add_omega_data() {
|
||||
c_omega.update_device(this->atom->nall(),true);
|
||||
}*/
|
||||
|
||||
inline void cast_omega_data(double** omega) {
|
||||
int nall = this->atom->nall();
|
||||
#ifdef GPU_CAST
|
||||
memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
|
||||
#else
|
||||
vec3d *host_p=reinterpret_cast<vec3d*>(&(omega[0][0]));
|
||||
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(c_omega[0]));
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel for schedule(static)
|
||||
#endif
|
||||
for (int i=0; i<nall; i++) {
|
||||
vp[i].x=host_p[i].x;
|
||||
vp[i].y=host_p[i].y;
|
||||
vp[i].z=host_p[i].z;
|
||||
vp[i].w=0.0;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Copy rmass to device asynchronously
|
||||
inline void add_omega_data() {
|
||||
c_omega.update_device(this->atom->nall()*4,true);
|
||||
}
|
||||
|
||||
inline void cast_vel_data(double** vel) {
|
||||
int nall = this->atom->nall();
|
||||
#ifdef GPU_CAST
|
||||
memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
|
||||
#else
|
||||
vec3d *host_p=reinterpret_cast<vec3d*>(&(vel[0][0]));
|
||||
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(c_vel[0]));
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel for schedule(static)
|
||||
#endif
|
||||
for (int i=0; i<nall; i++) {
|
||||
vp[i].x=host_p[i].x;
|
||||
vp[i].y=host_p[i].y;
|
||||
vp[i].z=host_p[i].z;
|
||||
vp[i].w=0.0;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Copy rmass to device asynchronously
|
||||
inline void add_vel_data() {
|
||||
c_vel.update_device(this->atom->nall()*4,true);
|
||||
}
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage() const;
|
||||
|
||||
void compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, tagint *tag,
|
||||
double **host_v, double *host_rad,
|
||||
double **host_omega, double *host_rmass, int limit_damping,
|
||||
const int nlocal, double *boxlo, double *prd);
|
||||
|
||||
int **compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_v, double *host_rad,
|
||||
double **host_omega, double *host_rmass, int limit_damping,
|
||||
double *boxlo, double *prd);
|
||||
|
||||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// coeff.x = k_n, coeff.y = k_t, coeff.z = cutsq, coeff.w = gamman
|
||||
UCL_D_Vec<numtyp4> coeff;
|
||||
|
||||
numtyp _k_n;
|
||||
numtyp _k_t;
|
||||
numtyp _gamman;
|
||||
numtyp _gammat;
|
||||
numtyp _xmu;
|
||||
numtyp _dt;
|
||||
numtyp _dampflag;
|
||||
int _limit_damping;
|
||||
/// coeff2.x = gammat, coeff2.y = xmu
|
||||
UCL_D_Vec<numtyp2> coeff2;
|
||||
/// Special LJ values
|
||||
UCL_D_Vec<numtyp> sp_lj;
|
||||
|
||||
UCL_D_Vec<numtyp> _mask;
|
||||
|
||||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
/// Per-atom arrays
|
||||
UCL_Vector<numtyp,numtyp> c_rad;
|
||||
|
||||
UCL_Vector<numtyp,numtyp> c_omega;
|
||||
|
||||
UCL_Vector<numtyp,numtyp> c_vel;
|
||||
|
||||
UCL_Vector<numtyp,numtyp> c_rmass;
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture rad_tex;
|
||||
|
||||
UCL_Texture omega_tex;
|
||||
|
||||
UCL_Texture vel_tex;
|
||||
|
||||
UCL_Texture rmass_tex;
|
||||
|
||||
numtyp freeze_group_bit;
|
||||
|
||||
int _max_rad_size;
|
||||
|
||||
private:
|
||||
bool _shared_view;
|
||||
bool _allocated;
|
||||
int loop(const int _eflag, const int _vflag);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
134
lib/gpu/lal_gran_hooke_ext.cpp
Normal file
134
lib/gpu/lal_gran_hooke_ext.cpp
Normal file
@ -0,0 +1,134 @@
|
||||
/***************************************************************************
|
||||
* gran_hooke_ext.cpp
|
||||
* -------------------
|
||||
* Trung Dac Nguyen (ORNL)
|
||||
*
|
||||
* Functions for LAMMPS access to gran/hooke acceleration routines.
|
||||
*
|
||||
* __________________________________________________________________________
|
||||
* This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
* __________________________________________________________________________
|
||||
*
|
||||
* begin :
|
||||
* email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
#include "lal_gran_hooke.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace LAMMPS_AL;
|
||||
|
||||
static GranHooke<PRECISION,ACC_PRECISION> GHM;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int gran_hooke_gpu_init(const int ntypes, double **cutsq,
|
||||
const double host_k_n, const double host_k_t,
|
||||
const double host_gamman, const double host_gammat,
|
||||
const double host_xmu, const double host_dt, const int host_dampflag,
|
||||
double *special_lj, int *host_mask,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size, int &gpu_mode,
|
||||
FILE *screen) {
|
||||
GHM.clear();
|
||||
gpu_mode=GHM.device->gpu_mode();
|
||||
double gpu_split=GHM.device->particle_split();
|
||||
int first_gpu=GHM.device->first_device();
|
||||
int last_gpu=GHM.device->last_device();
|
||||
int world_me=GHM.device->world_me();
|
||||
int gpu_rank=GHM.device->gpu_rank();
|
||||
int procs_per_gpu=GHM.device->procs_per_gpu();
|
||||
|
||||
GHM.device->init_message(screen,"gran/hooke",first_gpu,last_gpu);
|
||||
|
||||
bool message=false;
|
||||
if (GHM.device->replica_me()==0 && screen)
|
||||
message=true;
|
||||
|
||||
if (message) {
|
||||
fprintf(screen,"Initializing Device and compiling on process 0...");
|
||||
fflush(screen);
|
||||
}
|
||||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=GHM.init(ntypes, host_k_n, host_k_t, host_gamman,
|
||||
host_gammat, host_xmu, host_dt, host_dampflag, special_lj,
|
||||
host_mask, inum, nall, max_nbors, maxspecial, cell_size,
|
||||
gpu_split, screen);
|
||||
|
||||
GHM.device->world_barrier();
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (message) {
|
||||
if (last_gpu-first_gpu==0)
|
||||
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
|
||||
else
|
||||
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
|
||||
last_gpu,i);
|
||||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=GHM.init(ntypes, host_k_n, host_k_t, host_gamman,
|
||||
host_gammat, host_xmu, host_dt, host_dampflag, special_lj,
|
||||
host_mask, inum, nall, max_nbors, maxspecial, cell_size,
|
||||
gpu_split, screen);
|
||||
|
||||
GHM.device->serialize_init();
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
fprintf(screen,"\n");
|
||||
|
||||
if (init_ok==0)
|
||||
GHM.estimate_gpu_overhead();
|
||||
return init_ok;
|
||||
}
|
||||
|
||||
void gran_hooke_gpu_clear() {
|
||||
GHM.clear();
|
||||
}
|
||||
|
||||
int ** gran_hooke_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_v, double *host_rad,
|
||||
double **host_omega, double *host_rmass, int limit_damping,
|
||||
double *boxlo, double *prd) {
|
||||
return GHM.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
|
||||
tag, nspecial, special, eflag, vflag, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time, success, host_v, host_rad,
|
||||
host_omega, host_rmass, limit_damping, boxlo, prd);
|
||||
}
|
||||
|
||||
void gran_hooke_gpu_compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, tagint *tag,
|
||||
double **host_v, double *host_rad,
|
||||
double **host_omega, double *host_rmass, int limit_damping,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
GHM.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
|
||||
firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time,
|
||||
success, tag, host_v, host_rad, host_omega, host_rmass, limit_damping, nlocal, boxlo, prd);
|
||||
}
|
||||
|
||||
double gran_hooke_gpu_bytes() {
|
||||
return GHM.host_memory_usage();
|
||||
}
|
||||
477
src/GPU/pair_gran_hooke_gpu.cpp
Normal file
477
src/GPU/pair_gran_hooke_gpu.cpp
Normal file
@ -0,0 +1,477 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
LAMMPS development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Trung Dac Nguyen (ORNL)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "pair_gran_hooke_gpu.h"
|
||||
|
||||
#include "atom.h"
|
||||
#include "domain.h"
|
||||
#include "error.h"
|
||||
#include "force.h"
|
||||
#include "gpu_extra.h"
|
||||
#include "neigh_list.h"
|
||||
#include "neighbor.h"
|
||||
#include "suffix.h"
|
||||
#include "update.h"
|
||||
#include "memory.h"
|
||||
#include "comm.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
// External functions from cuda library for atom decomposition
|
||||
|
||||
int gran_hooke_gpu_init(const int ntypes, double **cutsq,
|
||||
const double host_k_n, const double host_k_t,
|
||||
const double host_gamman, const double host_gammat,
|
||||
const double host_xmu, const double host_dt, const int host_dampflag,
|
||||
double *special_lj, int *host_mask,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size, int &gpu_mode,
|
||||
FILE *screen);
|
||||
void gran_hooke_gpu_clear();
|
||||
int ** gran_hooke_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_v, double *host_rad,
|
||||
double **host_omega, double *host_rmass, int limit_damping,
|
||||
double *boxlo, double *prd);
|
||||
void gran_hooke_gpu_compute(const int f_ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, tagint *tag,
|
||||
double **host_v, double *host_rad,
|
||||
double **host_omega, double *host_rmass, int limit_damping,
|
||||
const int nlocal, double *boxlo, double *prd);
|
||||
double gran_hooke_gpu_bytes();
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
PairGranHookeGPU::PairGranHookeGPU(LAMMPS *lmp) :
|
||||
PairGranHooke(lmp), gpu_mode(GPU_FORCE)
|
||||
{
|
||||
respa_enable = 0;
|
||||
reinitflag = 0;
|
||||
cpu_time = 0.0;
|
||||
suffix_flag |= Suffix::GPU;
|
||||
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
free all arrays
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
PairGranHookeGPU::~PairGranHookeGPU()
|
||||
{
|
||||
gran_hooke_gpu_clear();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairGranHookeGPU::compute(int eflag, int vflag)
|
||||
{
|
||||
ev_init(eflag, vflag);
|
||||
|
||||
int nall = atom->nlocal + atom->nghost;
|
||||
int inum, host_start;
|
||||
|
||||
bool success = true;
|
||||
int *ilist, *numneigh, **firstneigh;
|
||||
|
||||
int shearupdate = 1;
|
||||
if (update->setupflag) shearupdate = 0;
|
||||
|
||||
if (gpu_mode != GPU_FORCE) {
|
||||
double sublo[3], subhi[3];
|
||||
if (domain->triclinic == 0) {
|
||||
sublo[0] = domain->sublo[0];
|
||||
sublo[1] = domain->sublo[1];
|
||||
sublo[2] = domain->sublo[2];
|
||||
subhi[0] = domain->subhi[0];
|
||||
subhi[1] = domain->subhi[1];
|
||||
subhi[2] = domain->subhi[2];
|
||||
} else {
|
||||
domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi);
|
||||
}
|
||||
|
||||
|
||||
inum = atom->nlocal;
|
||||
firstneigh = gran_hooke_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type,
|
||||
sublo, subhi, atom->tag, atom->nspecial,
|
||||
atom->special, eflag, vflag, eflag_atom,
|
||||
vflag_atom, host_start, &ilist, &numneigh,
|
||||
cpu_time, success, atom->v,atom->radius, atom->omega, atom->rmass,
|
||||
limit_damping, domain->boxlo, domain->prd);
|
||||
} else {
|
||||
inum = list->inum;
|
||||
ilist = list->ilist;
|
||||
numneigh = list->numneigh;
|
||||
firstneigh = list->firstneigh;
|
||||
gran_hooke_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh,
|
||||
firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start,
|
||||
cpu_time, success, atom->tag, atom->v, atom->radius, atom->omega, atom->rmass,
|
||||
limit_damping, atom->nlocal, domain->boxlo, domain->prd);
|
||||
}
|
||||
if (!success) error->one(FLERR, "Insufficient memory on accelerator");
|
||||
|
||||
if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
|
||||
neighbor->build_topology();
|
||||
if (host_start < inum) {
|
||||
cpu_time = platform::walltime();
|
||||
cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
|
||||
cpu_time = platform::walltime() - cpu_time;
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
init specific to this pair style
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairGranHookeGPU::init_style()
|
||||
{
|
||||
if (!atom->radius_flag || !atom->rmass_flag || !atom->omega_flag)
|
||||
error->all(FLERR, "Pair style gran/hooke/history/gpu requires atom attributes radius, rmass, omega");
|
||||
if (comm->ghost_velocity == 0)
|
||||
error->all(FLERR, "Pair gran/h* requires ghost atoms store velocity");
|
||||
|
||||
if (gpu_mode == GPU_FORCE)
|
||||
{
|
||||
if (history)
|
||||
neighbor->add_request(this, NeighConst::REQ_SIZE | NeighConst::REQ_HISTORY);
|
||||
else
|
||||
neighbor->add_request(this, NeighConst::REQ_SIZE);
|
||||
}
|
||||
|
||||
// Repeat cutsq calculation because done after call to init_style
|
||||
double maxcut = -1.0;
|
||||
double cut;
|
||||
for (int i = 1; i <= atom->ntypes; i++) {
|
||||
for (int j = i; j <= atom->ntypes; j++) {
|
||||
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
|
||||
cut = init_one(i, j);
|
||||
cut *= cut;
|
||||
if (cut > maxcut) maxcut = cut;
|
||||
cutsq[i][j] = cutsq[j][i] = cut;
|
||||
} else
|
||||
cutsq[i][j] = cutsq[j][i] = 0.0;
|
||||
}
|
||||
}
|
||||
double cell_size = sqrt(maxcut) + neighbor->skin;
|
||||
|
||||
int maxspecial = 0;
|
||||
if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
|
||||
int mnf = 5e-2 * neighbor->oneatom;
|
||||
//dt = update->dt;
|
||||
|
||||
if (history && (fix_history == nullptr)) {
|
||||
auto cmd = fmt::format("NEIGH_HISTORY_HH{} all NEIGH_HISTORY {}", instance_me, size_history);
|
||||
fix_history = dynamic_cast<FixNeighHistory *>(
|
||||
modify->replace_fix("NEIGH_HISTORY_HH_DUMMY" + std::to_string(instance_me), cmd, 1));
|
||||
fix_history->pair = this;
|
||||
}
|
||||
|
||||
// check for FixFreeze and set freeze_group_bit
|
||||
|
||||
auto fixlist = modify->get_fix_by_style("^freeze");
|
||||
if (fixlist.size() == 0)
|
||||
freeze_group_bit = 0;
|
||||
else if (fixlist.size() > 1)
|
||||
error->all(FLERR, "Only one fix freeze command at a time allowed");
|
||||
else
|
||||
freeze_group_bit = fixlist.front()->groupbit;
|
||||
|
||||
// check for FixRigid so can extract rigid body masses
|
||||
|
||||
fix_rigid = nullptr;
|
||||
for (const auto &ifix : modify->get_fix_list()) {
|
||||
if (ifix->rigid_flag) {
|
||||
if (fix_rigid)
|
||||
error->all(FLERR, "Only one fix rigid command at a time allowed");
|
||||
else
|
||||
fix_rigid = ifix;
|
||||
}
|
||||
}
|
||||
|
||||
// check for FixPour and FixDeposit so can extract particle radii
|
||||
|
||||
auto pours = modify->get_fix_by_style("^pour");
|
||||
auto deps = modify->get_fix_by_style("^deposit");
|
||||
|
||||
// set maxrad_dynamic and maxrad_frozen for each type
|
||||
// include future FixPour and FixDeposit particles as dynamic
|
||||
|
||||
int itype;
|
||||
for (int i = 1; i <= atom->ntypes; i++) {
|
||||
onerad_dynamic[i] = onerad_frozen[i] = 0.0;
|
||||
for (auto &ipour : pours) {
|
||||
itype = i;
|
||||
double maxrad = *((double *) ipour->extract("radius", itype));
|
||||
if (maxrad > 0.0) onerad_dynamic[i] = maxrad;
|
||||
}
|
||||
for (auto &idep : deps) {
|
||||
itype = i;
|
||||
double maxrad = *((double *) idep->extract("radius", itype));
|
||||
if (maxrad > 0.0) onerad_dynamic[i] = maxrad;
|
||||
}
|
||||
}
|
||||
|
||||
double *radius = atom->radius;
|
||||
int *mask = atom->mask;
|
||||
int *type = atom->type;
|
||||
int nlocal = atom->nlocal;
|
||||
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
if (mask[i] & freeze_group_bit)
|
||||
onerad_frozen[type[i]] = MAX(onerad_frozen[type[i]], radius[i]);
|
||||
else
|
||||
onerad_dynamic[type[i]] = MAX(onerad_dynamic[type[i]], radius[i]);
|
||||
}
|
||||
|
||||
MPI_Allreduce(&onerad_dynamic[1], &maxrad_dynamic[1], atom->ntypes, MPI_DOUBLE, MPI_MAX, world);
|
||||
MPI_Allreduce(&onerad_frozen[1], &maxrad_frozen[1], atom->ntypes, MPI_DOUBLE, MPI_MAX, world);
|
||||
|
||||
// set fix which stores history info
|
||||
|
||||
if (history) {
|
||||
fix_history = dynamic_cast<FixNeighHistory *>(
|
||||
modify->get_fix_by_id("NEIGH_HISTORY_HH" + std::to_string(instance_me)));
|
||||
if (!fix_history) error->all(FLERR, "Could not find pair fix neigh history ID");
|
||||
}
|
||||
|
||||
int success = gran_hooke_gpu_init(atom->ntypes + 1, cutsq, kn, kt, gamman, gammat, xmu, update->dt,
|
||||
dampflag, force->special_lj, atom->mask,
|
||||
atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial,
|
||||
cell_size, gpu_mode, screen);
|
||||
GPU_EXTRA::check_flag(success, error, world);
|
||||
|
||||
|
||||
}
|
||||
|
||||
void PairGranHookeGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist,
|
||||
int *numneigh, int **firstneigh)
|
||||
{
|
||||
int i, j, ii, jj, jnum;
|
||||
double xtmp, ytmp, ztmp, delx, dely, delz, fx, fy, fz;
|
||||
double radi, radj, radsum, rsq, r, rinv, rsqinv, factor_lj;
|
||||
double vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
|
||||
double wr1, wr2, wr3;
|
||||
double vtr1, vtr2, vtr3, vrel;
|
||||
double mi, mj, meff, damp, ccel, tor1, tor2, tor3;
|
||||
double fn,fs,ft,fs1,fs2,fs3;
|
||||
double shrmag, rsht;
|
||||
int *jlist;
|
||||
int *touch, **firsttouch;
|
||||
double *shear, *allshear, **firstshear;
|
||||
|
||||
ev_init(eflag, vflag);
|
||||
|
||||
int shearupdate = 1;
|
||||
if (update->setupflag) shearupdate = 0;
|
||||
|
||||
// update rigid body info for owned & ghost atoms if using FixRigid masses
|
||||
// body[i] = which body atom I is in, -1 if none
|
||||
// mass_body = mass of each rigid body
|
||||
|
||||
if (fix_rigid && neighbor->ago == 0) {
|
||||
int tmp;
|
||||
int *body = (int *) fix_rigid->extract("body", tmp);
|
||||
auto mass_body = (double *) fix_rigid->extract("masstotal", tmp);
|
||||
if (atom->nmax > nmax) {
|
||||
memory->destroy(mass_rigid);
|
||||
nmax = atom->nmax;
|
||||
memory->create(mass_rigid, nmax, "pair:mass_rigid");
|
||||
}
|
||||
int nlocal = atom->nlocal;
|
||||
for (i = 0; i < nlocal; i++)
|
||||
if (body[i] >= 0)
|
||||
mass_rigid[i] = mass_body[body[i]];
|
||||
else
|
||||
mass_rigid[i] = 0.0;
|
||||
comm->forward_comm(this);
|
||||
}
|
||||
|
||||
double **x = atom->x;
|
||||
double **v = atom->v;
|
||||
double **f = atom->f;
|
||||
double **omega = atom->omega;
|
||||
double **torque = atom->torque;
|
||||
double *radius = atom->radius;
|
||||
double *rmass = atom->rmass;
|
||||
int *mask = atom->mask;
|
||||
int nlocal = atom->nlocal;
|
||||
int newton_pair = force->newton_pair;
|
||||
double *special_lj = force->special_lj;
|
||||
|
||||
inum = list->inum;
|
||||
ilist = list->ilist;
|
||||
numneigh = list->numneigh;
|
||||
firstneigh = list->firstneigh;
|
||||
firsttouch = fix_history->firstflag;
|
||||
firstshear = fix_history->firstvalue;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
|
||||
for (ii = 0; ii < inum; ii++) {
|
||||
i = ilist[ii];
|
||||
xtmp = x[i][0];
|
||||
ytmp = x[i][1];
|
||||
ztmp = x[i][2];
|
||||
radi = radius[i];
|
||||
jlist = firstneigh[i];
|
||||
jnum = numneigh[i];
|
||||
|
||||
for (jj = 0; jj < jnum; jj++) {
|
||||
j = jlist[jj];
|
||||
factor_lj = special_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
if (factor_lj == 0) continue;
|
||||
|
||||
delx = xtmp - x[j][0];
|
||||
dely = ytmp - x[j][1];
|
||||
delz = ztmp - x[j][2];
|
||||
rsq = delx*delx + dely*dely + delz*delz;
|
||||
radj = radius[j];
|
||||
radsum = radi + radj;
|
||||
|
||||
if (rsq < radsum*radsum) {
|
||||
r = sqrt(rsq);
|
||||
rinv = 1.0/r;
|
||||
rsqinv = 1.0/rsq;
|
||||
|
||||
// relative translational velocity
|
||||
|
||||
vr1 = v[i][0] - v[j][0];
|
||||
vr2 = v[i][1] - v[j][1];
|
||||
vr3 = v[i][2] - v[j][2];
|
||||
|
||||
// normal component
|
||||
|
||||
vnnr = vr1*delx + vr2*dely + vr3*delz;
|
||||
vn1 = delx*vnnr * rsqinv;
|
||||
vn2 = dely*vnnr * rsqinv;
|
||||
vn3 = delz*vnnr * rsqinv;
|
||||
|
||||
// tangential component
|
||||
|
||||
vt1 = vr1 - vn1;
|
||||
vt2 = vr2 - vn2;
|
||||
vt3 = vr3 - vn3;
|
||||
|
||||
// relative rotational velocity
|
||||
|
||||
wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
|
||||
wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
|
||||
wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
|
||||
|
||||
// meff = effective mass of pair of particles
|
||||
// if I or J part of rigid body, use body mass
|
||||
// if I or J is frozen, meff is other particle
|
||||
|
||||
mi = rmass[i];
|
||||
mj = rmass[j];
|
||||
if (fix_rigid) {
|
||||
if (mass_rigid[i] > 0.0) mi = mass_rigid[i];
|
||||
if (mass_rigid[j] > 0.0) mj = mass_rigid[j];
|
||||
}
|
||||
|
||||
meff = mi*mj / (mi+mj);
|
||||
if (mask[i] & freeze_group_bit) meff = mj;
|
||||
if (mask[j] & freeze_group_bit) meff = mi;
|
||||
|
||||
// normal forces = Hookian contact + normal velocity damping
|
||||
|
||||
damp = meff*gamman*vnnr*rsqinv;
|
||||
ccel = kn*(radsum-r)*rinv - damp;
|
||||
if (limit_damping && (ccel < 0.0)) ccel = 0.0;
|
||||
|
||||
// relative velocities
|
||||
|
||||
vtr1 = vt1 - (delz*wr2-dely*wr3);
|
||||
vtr2 = vt2 - (delx*wr3-delz*wr1);
|
||||
vtr3 = vt3 - (dely*wr1-delx*wr2);
|
||||
vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
|
||||
vrel = sqrt(vrel);
|
||||
|
||||
// force normalization
|
||||
|
||||
fn = xmu * fabs(ccel*r);
|
||||
fs = meff*gammat*vrel;
|
||||
if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
|
||||
else ft = 0.0;
|
||||
|
||||
// tangential force due to tangential velocity damping
|
||||
|
||||
fs1 = -ft*vtr1;
|
||||
fs2 = -ft*vtr2;
|
||||
fs3 = -ft*vtr3;
|
||||
|
||||
// forces & torques
|
||||
|
||||
fx = delx*ccel + fs1;
|
||||
fy = dely*ccel + fs2;
|
||||
fz = delz*ccel + fs3;
|
||||
fx *= factor_lj;
|
||||
fy *= factor_lj;
|
||||
fz *= factor_lj;
|
||||
f[i][0] += fx;
|
||||
f[i][1] += fy;
|
||||
f[i][2] += fz;
|
||||
|
||||
tor1 = rinv * (dely*fs3 - delz*fs2);
|
||||
tor2 = rinv * (delz*fs1 - delx*fs3);
|
||||
tor3 = rinv * (delx*fs2 - dely*fs1);
|
||||
tor1 *= factor_lj;
|
||||
tor2 *= factor_lj;
|
||||
tor3 *= factor_lj;
|
||||
torque[i][0] -= radi*tor1;
|
||||
torque[i][1] -= radi*tor2;
|
||||
torque[i][2] -= radi*tor3;
|
||||
|
||||
if (newton_pair || j < nlocal) {
|
||||
f[j][0] -= fx;
|
||||
f[j][1] -= fy;
|
||||
f[j][2] -= fz;
|
||||
torque[j][0] -= radj*tor1;
|
||||
torque[j][1] -= radj*tor2;
|
||||
torque[j][2] -= radj*tor3;
|
||||
}
|
||||
|
||||
if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
|
||||
0.0,0.0,fx,fy,fz,delx,dely,delz);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vflag_fdotr) virial_fdotr_compute();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double PairGranHookeGPU::memory_usage()
|
||||
{
|
||||
double bytes = Pair::memory_usage();
|
||||
return bytes + gran_hooke_gpu_bytes();
|
||||
}
|
||||
46
src/GPU/pair_gran_hooke_gpu.h
Normal file
46
src/GPU/pair_gran_hooke_gpu.h
Normal file
@ -0,0 +1,46 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
LAMMPS development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
// clang-format off
|
||||
PairStyle(gran/hooke/gpu,PairGranHookeGPU);
|
||||
// clang-format on
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_GRAN_HOOKE_GPU_H
|
||||
#define LMP_PAIR_GRAN_HOOKE_GPU_H
|
||||
|
||||
#include "pair_gran_hooke.h"
|
||||
#include "fix_neigh_history.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PairGranHookeGPU : public PairGranHooke {
|
||||
public:
|
||||
PairGranHookeGPU(LAMMPS *lmp);
|
||||
~PairGranHookeGPU() override;
|
||||
void cpu_compute(int, int, int, int, int *, int *, int **);
|
||||
void compute(int, int) override;
|
||||
void init_style() override;
|
||||
double memory_usage() override;
|
||||
|
||||
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
|
||||
|
||||
private:
|
||||
int gpu_mode;
|
||||
double cpu_time;
|
||||
};
|
||||
|
||||
} // namespace LAMMPS_NS
|
||||
#endif
|
||||
#endif
|
||||
Reference in New Issue
Block a user