diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp
new file mode 100644
index 0000000000..0efb68a9fb
--- /dev/null
+++ b/lib/gpu/lal_base_dpd.cpp
@@ -0,0 +1,307 @@
+/***************************************************************************
+                                base_dpd.cpp
+                             -------------------
+                        Trung Dac Nguyen (ORNL)
+
+  Base class for pair styles needing per-particle data for position,
+  velocity, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : Jan 15, 2014
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#include "lal_base_dpd.h"
+using namespace LAMMPS_AL;
+#define BaseDPDT BaseDPD<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> global_device;
+
+template <class numtyp, class acctyp>
+BaseDPDT::BaseDPD() : _compiled(false), _max_bytes(0) {
+  device=&global_device;
+  ans=new Answer<numtyp,acctyp>();
+  nbor=new Neighbor();
+}
+
+template <class numtyp, class acctyp>
+BaseDPDT::~BaseDPD() {
+  delete ans;
+  delete nbor;
+}
+
+template <class numtyp, class acctyp>
+int BaseDPDT::bytes_per_atom_atomic(const int max_nbors) const {
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BaseDPDT::init_atomic(const int nlocal, const int nall,
+                          const int max_nbors, const int maxspecial,
+                          const double cell_size,
+                          const double gpu_split, FILE *_screen,
+                          const void *pair_program, const char *k_name) {
+  screen=_screen;
+
+  int gpu_nbor=0;
+  if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+
+  int _gpu_host=0;
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
+  if (host_nlocal>0)
+    _gpu_host=1;
+
+  _threads_per_atom=device->threads_per_charge();
+  if (_threads_per_atom>1 && gpu_nbor==0) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false,
+                           _threads_per_atom,true);
+  if (success!=0)
+    return success;
+
+  ucl_device=device->gpu;
+  atom=&device->atom;
+
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
+  compile_kernels(*ucl_device,pair_program,k_name);
+
+  // Initialize host-device load balancer
+  hd_balancer.init(device,gpu_nbor,gpu_split);
+
+  // Initialize timers for the selected GPU
+  time_pair.init(*ucl_device);
+  time_pair.zero();
+
+  pos_tex.bind_float(atom->x,4);
+  vel_tex.bind_float(atom->v,4);
+
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+
+  return success;
+}
+
+template <class numtyp, class acctyp>
+void BaseDPDT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
+}
+
+template <class numtyp, class acctyp>
+void BaseDPDT::clear_atomic() {
+  // Output any timing information
+  acc_timers();
+  double avg_split=hd_balancer.all_avg_split();
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
+
+  if (_compiled) {
+    k_pair_fast.clear();
+    k_pair.clear();
+    delete pair_program;
+    _compiled=false;
+  }
+
+  time_pair.clear();
+  hd_balancer.clear();
+
+  nbor->clear();
+  ans->clear();
+  device->clear();
+}
+
+// ---------------------------------------------------------------------------
+// Copy neighbor list from host
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int * BaseDPDT::reset_nbors(const int nall, const int inum, int *ilist,
+                            int *numj, int **firstneigh, bool &success) {
+  success=true;
+
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
+  resize_atom(inum,nall,success);
+  resize_local(inum,mn,success);
+  if (!success)
+    return NULL;
+
+  nbor->get_host(inum,ilist,numj,firstneigh,block_size());
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+
+  return ilist;
+}
+
+// ---------------------------------------------------------------------------
+// Build neighbor list on device
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
+                                      const int nall, double **host_x,
+                                      int *host_type, double *sublo,
+                                      double *subhi, tagint *tag,
+                                      int **nspecial, tagint **special,
+                                      bool &success) {
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;
+  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
+                        tag, nspecial, special, success, mn);
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+}
+
+// ---------------------------------------------------------------------------
+// Copy neighbor list from host if necessary, then compute forces, virials,
+// and energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void BaseDPDT::compute(const int f_ago, const int inum_full,
+                       const int nall, double **host_x, int *host_type,
+                       int *ilist, int *numj, int **firstneigh,
+                       const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom,
+                       int &host_start, const double cpu_time,
+                       bool &success, tagint *tag, double **host_v,
+                       const double dtinvsqrt, const int seed,
+                       const int timestep, const int nlocal,
+                       double *boxlo, double *prd) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return;
+  }
+
+  int ago=hd_balancer.ago_first(f_ago);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
+  host_start=inum;
+
+  if (ago==0) {
+    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
+    if (!success)
+      return;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  atom->cast_v_data(host_v,tag);
+  hd_balancer.start_timer();
+  atom->add_x_data(host_x,host_type);
+  atom->add_v_data(host_v,tag);
+
+  _dtinvsqrt = dtinvsqrt;
+  _seed = seed;
+  _timestep = timestep;
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+}
+
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary and then compute forces, virials, energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** BaseDPDT::compute(const int ago, const int inum_full,
+                        const int nall, double **host_x, int *host_type,
+                        double *sublo, double *subhi, tagint *tag,
+                        int **nspecial, tagint **special, const bool eflag,
+                        const bool vflag, const bool eatom,
+                        const bool vatom, int &host_start,
+                        int **ilist, int **jnum,
+                        const double cpu_time, bool &success,
+                        double **host_v, const double dtinvsqrt,
+                        const int seed, const int timestep,
+                        double *boxlo, double *prd) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return NULL;
+  }
+
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+
+  // Build neighbor list on GPU if necessary
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    sublo, subhi, tag, nspecial, special, success);
+    if (!success)
+      return NULL;
+    atom->cast_v_data(host_v,tag);
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    atom->cast_v_data(host_v,tag);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  atom->add_v_data(host_v,tag);
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  _dtinvsqrt = dtinvsqrt;
+  _seed = seed;
+  _timestep = timestep;
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  return nbor->host_jlist.begin()-host_start;
+}
+
+template <class numtyp, class acctyp>
+double BaseDPDT::host_memory_usage_atomic() const {
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(BaseDPD<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str,
+                               const char *kname) {
+  if (_compiled)
+    return;
+
+  std::string s_fast=std::string(kname)+"_fast";
+  pair_program=new UCL_Program(dev);
+  pair_program->load_string(pair_str,device->compile_string().c_str());
+  k_pair_fast.set_function(*pair_program,s_fast.c_str());
+  k_pair.set_function(*pair_program,kname);
+  pos_tex.get_texture(*pair_program,"pos_tex");
+  vel_tex.get_texture(*pair_program,"vel_tex");
+
+  _compiled=true;
+}
+
+template class BaseDPD<PRECISION,ACC_PRECISION>;
+
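Both compute() paths above defer the actual force evaluation to the pure virtual loop(), so a concrete style only needs to supply coefficient storage and a kernel launch. The sketch below shows the typical shape of such an override for a hypothetical derived style MyDPD (assumed declared as class MyDPD : public BaseDPD<numtyp,acctyp>); the kernel argument list is illustrative and would have to match the style's actual .cu/.cl kernel signature:

    // Sketch only: MyDPD and its kernel arguments are hypothetical,
    // modeled on the concrete DPD style further down in this patch.
    #include <math.h>  // ceil

    template <class numtyp, class acctyp>
    void MyDPD<numtyp,acctyp>::loop(const bool _eflag, const bool _vflag) {
      const int BX=this->block_size();
      int eflag = _eflag ? 1 : 0;
      int vflag = _vflag ? 1 : 0;
      // Enough blocks so that inum atoms, with _threads_per_atom
      // threads assigned to each atom, fill blocks of BX threads
      int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                   (BX/this->_threads_per_atom)));
      int ainum=this->ans->inum();
      int nbor_pitch=this->nbor->nbor_pitch();
      this->time_pair.start();
      this->k_pair.set_size(GX,BX);
      this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor,
                       &this->_nbor_data->begin(), &this->ans->force,
                       &this->ans->engv, &eflag, &vflag, &ainum,
                       &nbor_pitch, &this->atom->v, &this->_dtinvsqrt,
                       &this->_seed, &this->_timestep,
                       &this->_threads_per_atom);
      this->time_pair.stop();
    }

For example, with BX=128 and four threads per atom, 10000 local atoms give GX = ceil(10000/32) = 313 blocks.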
diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h
new file mode 100644
index 0000000000..97640ed40e
--- /dev/null
+++ b/lib/gpu/lal_base_dpd.h
@@ -0,0 +1,203 @@
+/***************************************************************************
+                                 base_dpd.h
+                             -------------------
+                        Trung Dac Nguyen (ORNL)
+
+  Base class for pair styles needing per-particle data for position,
+  velocity, tag, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : Jan 15, 2014
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_BASE_DPD_H
+#define LAL_BASE_DPD_H
+
+#include "lal_device.h"
+#include "lal_balance.h"
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+#include "geryon/ocl_texture.h"
+#else
+#include "geryon/nvd_texture.h"
+#endif
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class BaseDPD {
+ public:
+  BaseDPD();
+  virtual ~BaseDPD();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * \param k_name name of the kernel for force calculation
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on the card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const void *pair_program, const char *k_name);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
+  /// Check if there is enough storage for atom arrays and realloc if not
+  /** \param success set to false if insufficient memory **/
+  inline void resize_atom(const int inum, const int nall, bool &success) {
+    if (atom->resize(nall, success)) {
+      pos_tex.bind_float(atom->x,4);
+      vel_tex.bind_float(atom->v,4);
+    }
+    ans->resize(inum,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param max_nbors current maximum number of neighbors
+    * \note olist_size=total number of local particles **/
+  inline void resize_local(const int inum, const int max_nbors,
+                           bool &success) {
+    nbor->resize(inum,max_nbors,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param host_inum number of particles whose nbors need to be copied
+    *        to the host
+    * \param max_nbors current maximum number of neighbors
+    * \note host_inum is 0 if the host is performing neighboring
+    * \note inum+host_inum=total number of local particles
+    * \note olist_size=0 **/
+  inline void resize_local(const int inum, const int host_inum,
+                           const int max_nbors, bool &success) {
+    nbor->resize(inum,host_inum,max_nbors,success);
+  }
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear_atomic();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom_atomic(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage_atomic() const;
+
+  /// Accumulate timers
+  inline void acc_timers() {
+    if (device->time_device()) {
+      nbor->acc_timers();
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
+    }
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    time_pair.zero();
+    atom->zero_timers();
+    ans->zero_timers();
+  }
+
+  /// Copy neighbor list from host
+  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
+                    int **firstneigh, bool &success);
+
+  /// Build neighbor list on device
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, tagint *tag,
+                       int **nspecial, tagint **special, bool &success);
+
+  /// Pair loop with host neighboring
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, tagint *tag,
+               double **v, const double dtinvsqrt, const int seed,
+               const int timestep, const int nlocal, double *boxlo,
+               double *prd);
+
+  /// Pair loop with device neighboring
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, tagint *tag, int **nspecial,
+                tagint **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time,
+                bool &success, double **v, const double dtinvsqrt,
+                const int seed, const int timestep, double *boxlo,
+                double *prd);
+
+  // -------------------------- DEVICE DATA -------------------------
+
+  /// Device Properties and Atom and Neighbor storage
+  Device<numtyp,acctyp> *device;
+
+  /// Geryon device
+  UCL_Device *ucl_device;
+
+  /// Device Timers
+  UCL_Timer time_pair;
+
+  /// Host device load balancer
+  Balance<numtyp,acctyp> hd_balancer;
+
+  /// LAMMPS pointer for screen output
+  FILE *screen;
+
+  // --------------------------- ATOM DATA --------------------------
+
+  /// Atom Data
+  Atom<numtyp,acctyp> *atom;
+
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  Answer<numtyp,acctyp> *ans;
+
+  // --------------------------- NBOR DATA ----------------------------
+
+  /// Neighbor data
+  Neighbor *nbor;
+
+  // ------------------------- DEVICE KERNELS -------------------------
+  UCL_Program *pair_program;
+  UCL_Kernel k_pair_fast, k_pair;
+  inline int block_size() { return _block_size; }
+
+  // --------------------------- TEXTURES -----------------------------
+  UCL_Texture pos_tex;
+  UCL_Texture vel_tex;
+
+  // ------------------------- COMMON VARS ----------------------------
+  numtyp _dtinvsqrt;
+  int _seed, _timestep;
+
+ protected:
+  bool _compiled;
+  int _block_size, _block_bio_size, _threads_per_atom;
+  double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
+
+  void compile_kernels(UCL_Device &dev, const void *pair_string,
+                       const char *k);
+  virtual void loop(const bool _eflag, const bool _vflag) = 0;
+};
+
+}
+
+#endif
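The return codes documented above for init_atomic() are what derived styles' init() methods and the *_gpu_init() wrappers propagate back to the caller. A minimal sketch of host-side handling; the function name lal_init_error and the message strings are illustrative, not LAMMPS's actual error text:

    // Hypothetical helper mapping the documented init return codes to
    // human-readable messages.
    inline const char *lal_init_error(const int code) {
      switch (code) {
        case  0: return "success";
        case -1: return "fix gpu not found";
        case -3: return "out of memory on device";
        case -4: return "GPU library was not compiled for this device";
        case -5: return "double precision is not supported on this card";
        default: return "unknown initialization error";
      }
    }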
diff --git a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp
new file mode 100644
index 0000000000..3736f89323
--- /dev/null
+++ b/lib/gpu/lal_dpd.cpp
@@ -0,0 +1,170 @@
+/***************************************************************************
+                                   dpd.cpp
+                             -------------------
+                        Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the dpd pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : Jan 15, 2014
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "dpd_cl.h"
+#elif defined(USE_CUDART)
+const char *dpd=0;
+#else
+#include "dpd_cubin.h"
+#endif
+
+#include "lal_dpd.h"
+#include <math.h>
+using namespace LAMMPS_AL;
+#define DPDT DPD<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+DPDT::DPD() : BaseDPD<numtyp,acctyp>(), _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+DPDT::~DPD() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int DPDT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int DPDT::init(const int ntypes,
+               double **host_cutsq, double **host_a0,
+               double **host_gamma, double **host_sigma,
+               double **host_cut, double *host_special_lj,
+               const bool tstat_only,
+               const int nlocal, const int nall,
+               const int max_nbors, const int maxspecial,
+               const double cell_size,
+               const double gpu_split, FILE *_screen) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,dpd,"k_dpd");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma,
+                         host_sigma,host_cut);
+
+  UCL_H_Vec<numtyp> host_rsq(lj_types*lj_types,*(this->ucl_device),
+                             UCL_WRITE_ONLY);
+  cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq);
+
+  UCL_H_Vec<double> dview;
+  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  dview.view(host_special_lj,4,*(this->ucl_device));
+  ucl_copy(sp_lj,dview,false);
+
+  _tstat_only = 0;
+  if (tstat_only) _tstat_only=1;
+
+  _allocated=true;
+  this->_max_bytes=coeff.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void DPDT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  coeff.clear();
+  cutsq.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double DPDT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(DPD<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and virials
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void DPDT::loop(const bool _eflag, const bool _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int eflag, vflag;
+  if (_eflag)
+    eflag=1;
+  else
+    eflag=0;
+
+  if (_vflag)
+    vflag=1;
+  else
+    vflag=0;
+
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->v,
+                          &cutsq, &this->_dtinvsqrt, &this->_seed,
+                          &this->_timestep, &this->_tstat_only,
+                          &this->_threads_per_atom);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->atom->v, &cutsq,
+                     &this->_dtinvsqrt, &this->_seed, &this->_timestep,
+                     &this->_tstat_only, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template <class numtyp, class acctyp>
+void DPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma,
+                        double **host_sigma, double **host_cut)
+{
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,
+                         host_gamma,host_sigma,host_cut);
+}
+
+template class DPD<PRECISION,ACC_PRECISION>;
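init() packs the four per-type-pair constants into a single numtyp4 through Atom::type_pack4, so the kernel can fetch a0, gamma, sigma, and the cutoff with one texture or global read. A device-side sketch of the unpack, assuming the layout declared in lal_dpd.h and a flattened lj_types*lj_types coefficient table; itype, jtype, and the surrounding kernel body are not shown:

    // Kernel-side sketch: one fetch yields all four DPD coefficients
    // for a type pair (layout per lal_dpd.h: x=a0, y=gamma, z=sigma,
    // w=cut).
    int mtype = itype*lj_types + jtype;
    numtyp4 c = coeff[mtype];
    numtyp a0    = c.x;   // conservative force amplitude
    numtyp gamma = c.y;   // dissipative (drag) coefficient
    numtyp sigma = c.z;   // random force amplitude
    numtyp cut   = c.w;   // pair cutoff distance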
diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h
new file mode 100644
index 0000000000..449d7b1d8c
--- /dev/null
+++ b/lib/gpu/lal_dpd.h
@@ -0,0 +1,86 @@
+/***************************************************************************
+                                    dpd.h
+                             -------------------
+                        Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the dpd pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : Jan 15, 2014
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_DPD_H
+#define LAL_DPD_H
+
+#include "lal_base_dpd.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class DPD : public BaseDPD<numtyp, acctyp> {
+ public:
+  DPD();
+  ~DPD();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on the card **/
+  int init(const int ntypes, double **host_cutsq, double **host_a0,
+           double **host_gamma, double **host_sigma, double **host_cut,
+           double *host_special_lj, const bool tstat_only, const int nlocal,
+           const int nall, const int max_nbors, const int maxspecial,
+           const double cell_size, const double gpu_split, FILE *screen);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  /// Update coeff if needed (tstat only)
+  void update_coeff(int ntypes, double **host_a0, double **host_gamma,
+                    double **host_sigma, double **host_cut);
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut
+  UCL_D_Vec<numtyp4> coeff;
+
+  UCL_D_Vec<numtyp> cutsq;
+
+  /// Special LJ values
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  /// Only used for thermostat
+  int _tstat_only;
+
+ private:
+  bool _allocated;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
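update_coeff() exists for the thermostat-only case: when a DPD thermostat's target temperature changes, the random-force amplitude must be rescaled to preserve the fluctuation-dissipation relation sigma_ij^2 = 2*kB*T*gamma_ij. A hedged host-side sketch; kB, t_target, and the host arrays are assumed to be owned by the calling pair style, and dpd_gpu_update_coeff is the wrapper defined in lal_dpd_ext.cpp below:

    // Sketch: rescale sigma after a target-temperature change, then
    // push the repacked coefficient table back to the device.  Names
    // other than dpd_gpu_update_coeff are illustrative.
    for (int i = 1; i <= ntypes; i++)
      for (int j = i; j <= ntypes; j++)
        host_sigma[i][j] = host_sigma[j][i] =
          sqrt(2.0*kB*t_target*host_gamma[i][j]);
    dpd_gpu_update_coeff(ntypes, host_a0, host_gamma, host_sigma, host_cut);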
diff --git a/lib/gpu/lal_dpd_ext.cpp b/lib/gpu/lal_dpd_ext.cpp
new file mode 100644
index 0000000000..327074d087
--- /dev/null
+++ b/lib/gpu/lal_dpd_ext.cpp
@@ -0,0 +1,133 @@
+/***************************************************************************
+                                 dpd_ext.cpp
+                             -------------------
+                        Trung Dac Nguyen (ORNL)
+
+  Functions for LAMMPS access to dpd acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : Jan 15, 2014
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <math.h>
+
+#include "lal_dpd.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static DPD<PRECISION,ACC_PRECISION> DPDMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0,
+                 double **host_gamma, double **host_sigma, double **host_cut,
+                 double *special_lj, bool tstat_only, const int inum,
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen) {
+  DPDMF.clear();
+  gpu_mode=DPDMF.device->gpu_mode();
+  double gpu_split=DPDMF.device->particle_split();
+  int first_gpu=DPDMF.device->first_device();
+  int last_gpu=DPDMF.device->last_device();
+  int world_me=DPDMF.device->world_me();
+  int gpu_rank=DPDMF.device->gpu_rank();
+  int procs_per_gpu=DPDMF.device->procs_per_gpu();
+
+  DPDMF.device->init_message(screen,"dpd",first_gpu,last_gpu);
+
+  bool message=false;
+  if (DPDMF.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma,
+                       host_cut, special_lj, tstat_only, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen);
+
+  DPDMF.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma,
+                         host_cut, special_lj, tstat_only, inum, nall, 300,
+                         maxspecial, cell_size, gpu_split, screen);
+
+    DPDMF.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    DPDMF.estimate_gpu_overhead();
+  return init_ok;
+}
+
+void dpd_gpu_clear() {
+  DPDMF.clear();
+}
+
+int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall,
+                         double **host_x, int *host_type, double *sublo,
+                         double *subhi, tagint *tag, int **nspecial,
+                         tagint **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double **host_v,
+                         const double dtinvsqrt, const int seed,
+                         const int timestep, double *boxlo, double *prd) {
+  return DPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success,
+                       host_v, dtinvsqrt, seed, timestep, boxlo, prd);
+}
+
+void dpd_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, tagint *tag,
+                     double **host_v, const double dtinvsqrt,
+                     const int seed, const int timestep,
+                     const int nlocal, double *boxlo, double *prd) {
+  DPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
+                firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time,
+                success, tag, host_v, dtinvsqrt, seed, timestep, nlocal,
+                boxlo, prd);
+}
+
+void dpd_gpu_update_coeff(int ntypes, double **host_a0, double **host_gamma,
+                          double **host_sigma, double **host_cut)
+{
+  DPDMF.update_coeff(ntypes,host_a0,host_gamma,host_sigma,host_cut);
+}
+
+double dpd_gpu_bytes() {
+  return DPDMF.host_memory_usage();
+}
+
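Taken together, these ext functions define the whole host-side lifecycle: one collective dpd_gpu_init() at style setup, one compute call per timestep (dpd_gpu_compute_n() when the device builds neighbor lists, dpd_gpu_compute() when the host does), and dpd_gpu_clear() at teardown. A condensed sketch with illustrative variable names and error handling elided:

    // Sketch of the call sequence a GPU-enabled pair style would drive
    // through this interface; all inputs are assumed to come from the
    // caller's atom and neighbor data.
    int gpu_mode;
    int err = dpd_gpu_init(ntypes, cutsq, a0, gamma, sigma, cut, special_lj,
                           false, nlocal, nlocal+nghost, 300, maxspecial,
                           cell_size, gpu_mode, screen);
    // err uses the return codes documented in lal_base_dpd.h

    // Each timestep, with device-side neighboring:
    int *ilist, *numj;
    int **firstneigh = dpd_gpu_compute_n(neighbor_ago, nlocal, nall, x, type,
                                         sublo, subhi, tag, nspecial, special,
                                         eflag, vflag, eflag_atom, vflag_atom,
                                         host_start, &ilist, &numj, cpu_time,
                                         success, v, dtinvsqrt, seed, timestep,
                                         boxlo, prd);

    dpd_gpu_clear();   // at teardown, or before re-initializing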