From b56bdd2d7bf3dac72bd98e713433ffa0152ac115 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Fri, 24 Jun 2016 13:55:14 -0400
Subject: [PATCH] remove trailing whitespace in gpu library

---
 lib/gpu/lal_answer.cpp | 46 ++--
 lib/gpu/lal_atom.cpp | 46 ++--
 lib/gpu/lal_atom.cu | 6 +-
 lib/gpu/lal_atom.h | 60 +++---
 lib/gpu/lal_balance.h | 18 +-
 lib/gpu/lal_base_atomic.cpp | 20 +-
 lib/gpu/lal_base_atomic.h | 22 +-
 lib/gpu/lal_base_charge.cpp | 16 +-
 lib/gpu/lal_base_charge.h | 12 +-
 lib/gpu/lal_base_dipole.cpp | 18 +-
 lib/gpu/lal_base_dipole.h | 12 +-
 lib/gpu/lal_base_dpd.cpp | 18 +-
 lib/gpu/lal_base_dpd.h | 12 +-
 lib/gpu/lal_base_ellipsoid.cpp | 34 +--
 lib/gpu/lal_base_ellipsoid.h | 28 +--
 lib/gpu/lal_beck.cpp | 10 +-
 lib/gpu/lal_beck.cu | 26 +--
 lib/gpu/lal_beck.h | 12 +-
 lib/gpu/lal_beck_ext.cpp | 6 +-
 lib/gpu/lal_born.cpp | 22 +-
 lib/gpu/lal_born.cu | 58 ++---
 lib/gpu/lal_born.h | 20 +-
 lib/gpu/lal_born_coul_long.cpp | 26 +--
 lib/gpu/lal_born_coul_long.cu | 268 +++++++++++------------
 lib/gpu/lal_born_coul_long.h | 20 +-
 lib/gpu/lal_born_coul_long_ext.cpp | 30 +--
 lib/gpu/lal_born_coul_wolf.cpp | 28 +--
 lib/gpu/lal_born_coul_wolf.cu | 64 +++---
 lib/gpu/lal_born_coul_wolf.h | 20 +-
 lib/gpu/lal_born_coul_wolf_ext.cpp | 22 +-
 lib/gpu/lal_born_ext.cpp | 28 +--
 lib/gpu/lal_buck.cpp | 28 +--
 lib/gpu/lal_buck.cu | 54 ++---
 lib/gpu/lal_buck.h | 18 +-
 lib/gpu/lal_buck_coul.cpp | 28 +--
 lib/gpu/lal_buck_coul.cu | 80 +++----
 lib/gpu/lal_buck_coul.h | 20 +-
 lib/gpu/lal_buck_coul_ext.cpp | 22 +-
 lib/gpu/lal_buck_coul_long.cpp | 24 +--
 lib/gpu/lal_buck_coul_long.cu | 276 ++++++++++++------------
 lib/gpu/lal_buck_coul_long.h | 14 +-
 lib/gpu/lal_buck_coul_long_ext.cpp | 18 +-
 lib/gpu/lal_buck_ext.cpp | 22 +-
 lib/gpu/lal_cg_cmm.cpp | 22 +-
 lib/gpu/lal_cg_cmm.cu | 44 ++--
 lib/gpu/lal_cg_cmm.h | 10 +-
 lib/gpu/lal_cg_cmm_ext.cpp | 12 +-
 lib/gpu/lal_cg_cmm_long.cpp | 24 +--
 lib/gpu/lal_cg_cmm_long.cu | 38 ++--
 lib/gpu/lal_cg_cmm_long.h | 12 +-
 lib/gpu/lal_cg_cmm_long_ext.cpp | 14 +-
 lib/gpu/lal_charmm_long.cpp | 20 +-
 lib/gpu/lal_charmm_long.cu | 42 ++--
 lib/gpu/lal_charmm_long.h | 12 +-
 lib/gpu/lal_charmm_long_ext.cpp | 10 +-
 lib/gpu/lal_colloid.cpp | 30 +--
 lib/gpu/lal_colloid.cu | 112 +++++-----
 lib/gpu/lal_colloid.h | 20 +-
 lib/gpu/lal_colloid_ext.cpp | 20 +-
 lib/gpu/lal_coul.cpp | 18 +-
 lib/gpu/lal_coul.cu | 38 ++--
 lib/gpu/lal_coul.h | 12 +-
 lib/gpu/lal_coul_debye.cpp | 16 +-
 lib/gpu/lal_coul_debye.cu | 26 +--
 lib/gpu/lal_coul_debye.h | 12 +-
 lib/gpu/lal_coul_debye_ext.cpp | 14 +-
 lib/gpu/lal_coul_dsf.cpp | 18 +-
 lib/gpu/lal_coul_dsf.cu | 56 ++---
 lib/gpu/lal_coul_dsf.h | 10 +-
 lib/gpu/lal_coul_dsf_ext.cpp | 20 +-
 lib/gpu/lal_coul_ext.cpp | 16 +-
 lib/gpu/lal_coul_long.cpp | 12 +-
 lib/gpu/lal_coul_long.cu | 18 +-
 lib/gpu/lal_coul_long.h | 6 +-
 lib/gpu/lal_coul_long_ext.cpp | 8 +-
 lib/gpu/lal_device.cpp | 108 +++++-----
 lib/gpu/lal_device.cu | 6 +-
 lib/gpu/lal_device.h | 68 +++---
 lib/gpu/lal_dipole_lj.cpp | 16 +-
 lib/gpu/lal_dipole_lj.cu | 88 ++++----
 lib/gpu/lal_dipole_lj.h | 8 +-
 lib/gpu/lal_dipole_lj_ext.cpp | 10 +-
 lib/gpu/lal_dipole_lj_sf.cpp | 20 +-
 lib/gpu/lal_dipole_lj_sf.cu | 122 +++++------
 lib/gpu/lal_dipole_lj_sf.h | 8 +-
 lib/gpu/lal_dipole_lj_sf_ext.cpp | 10 +-
 lib/gpu/lal_dpd.cpp | 26 +--
 lib/gpu/lal_dpd.cu | 84 ++++----
 lib/gpu/lal_dpd.h | 18 +-
 lib/gpu/lal_dpd_ext.cpp | 20 +-
 lib/gpu/lal_eam.cpp | 134 ++++++------
 lib/gpu/lal_eam.cu | 136 ++++++------
 lib/gpu/lal_eam.h | 54 ++---
 lib/gpu/lal_eam_alloy_ext.cpp | 32 +--
 lib/gpu/lal_eam_ext.cpp | 32 +--
 lib/gpu/lal_eam_fs_ext.cpp | 32 +--
 lib/gpu/lal_ellipsoid_extra.h | 14 +-
 lib/gpu/lal_ellipsoid_nbor.cu | 34 +--
 lib/gpu/lal_gauss.cpp | 18 +-
 lib/gpu/lal_gauss.cu | 60 +++---
 lib/gpu/lal_gauss.h | 18 +-
 lib/gpu/lal_gauss_ext.cpp | 22 +-
 lib/gpu/lal_gayberne.cpp | 66 +++---
 lib/gpu/lal_gayberne.cu | 72 +++----
 lib/gpu/lal_gayberne.h | 26 +--
 lib/gpu/lal_gayberne_ext.cpp | 20 +-
 lib/gpu/lal_gayberne_lj.cu | 130 ++++++------
 lib/gpu/lal_lj.cpp | 26 +--
 lib/gpu/lal_lj.cu | 60 +++---
 lib/gpu/lal_lj.h | 16 +-
 lib/gpu/lal_lj96.cpp | 14 +-
 lib/gpu/lal_lj96.cu | 50 ++---
 lib/gpu/lal_lj96.h | 10 +-
 lib/gpu/lal_lj96_ext.cpp | 6 +-
 lib/gpu/lal_lj_class2_long.cpp | 12 +-
 lib/gpu/lal_lj_class2_long.cu | 42 ++--
 lib/gpu/lal_lj_class2_long.h | 8 +-
 lib/gpu/lal_lj_class2_long_ext.cpp | 6 +-
 lib/gpu/lal_lj_coul.cpp | 20 +-
 lib/gpu/lal_lj_coul.cu | 46 ++--
 lib/gpu/lal_lj_coul.h | 8 +-
 lib/gpu/lal_lj_coul_debye.cpp | 18 +-
 lib/gpu/lal_lj_coul_debye.cu | 42 ++--
 lib/gpu/lal_lj_coul_debye.h | 8 +-
 lib/gpu/lal_lj_coul_debye_ext.cpp | 10 +-
 lib/gpu/lal_lj_coul_ext.cpp | 8 +-
 lib/gpu/lal_lj_coul_long.cpp | 18 +-
 lib/gpu/lal_lj_coul_long.cu | 38 ++--
 lib/gpu/lal_lj_coul_long.h | 10 +-
 lib/gpu/lal_lj_coul_long_ext.cpp | 16 +-
 lib/gpu/lal_lj_coul_msm.cpp | 20 +-
 lib/gpu/lal_lj_coul_msm.cu | 30 +--
 lib/gpu/lal_lj_coul_msm.h | 14 +-
 lib/gpu/lal_lj_coul_msm_ext.cpp | 8 +-
 lib/gpu/lal_lj_cubic.cpp | 22 +-
 lib/gpu/lal_lj_cubic.cu | 64 +++---
 lib/gpu/lal_lj_cubic.h | 16 +-
 lib/gpu/lal_lj_cubic_ext.cpp | 14 +-
 lib/gpu/lal_lj_dsf.cpp | 20 +-
 lib/gpu/lal_lj_dsf.cu | 46 ++--
 lib/gpu/lal_lj_dsf.h | 8 +-
 lib/gpu/lal_lj_dsf_ext.cpp | 8 +-
 lib/gpu/lal_lj_expand.cpp | 24 +--
 lib/gpu/lal_lj_expand.cu | 58 ++---
 lib/gpu/lal_lj_expand.h | 14 +-
 lib/gpu/lal_lj_expand_ext.cpp | 12 +-
 lib/gpu/lal_lj_ext.cpp | 12 +-
 lib/gpu/lal_lj_gromacs.cpp | 22 +-
 lib/gpu/lal_lj_gromacs.cu | 26 +--
 lib/gpu/lal_lj_gromacs.h | 12 +-
 lib/gpu/lal_lj_gromacs_ext.cpp | 12 +-
 lib/gpu/lal_mie.cpp | 12 +-
 lib/gpu/lal_mie.cu | 42 ++--
 lib/gpu/lal_mie.h | 12 +-
 lib/gpu/lal_mie_ext.cpp | 6 +-
 lib/gpu/lal_morse.cpp | 24 +--
 lib/gpu/lal_morse.cu | 48 ++---
 lib/gpu/lal_morse.h | 12 +-
 lib/gpu/lal_morse_ext.cpp | 12 +-
 lib/gpu/lal_neighbor_cpu.cu | 6 +-
 lib/gpu/lal_neighbor_gpu.cu | 72 +++---
 lib/gpu/lal_neighbor_shared.cpp | 2 +-
 lib/gpu/lal_neighbor_shared.h | 6 +-
 lib/gpu/lal_pppm.cpp | 44 ++--
 lib/gpu/lal_pppm.cu | 50 ++---
 lib/gpu/lal_pppm.h | 28 +--
 lib/gpu/lal_pppm_ext.cpp | 18 +-
 lib/gpu/lal_precision.h | 8 +-
 lib/gpu/lal_preprocessor.h | 20 +-
 lib/gpu/lal_re_squared.cpp | 52 ++---
 lib/gpu/lal_re_squared.cu | 40 ++--
 lib/gpu/lal_re_squared.h | 20 +-
 lib/gpu/lal_re_squared_ext.cpp | 20 +-
 lib/gpu/lal_re_squared_lj.cu | 140 ++++++------
 lib/gpu/lal_soft.cpp | 14 +-
 lib/gpu/lal_soft.cu | 32 +--
 lib/gpu/lal_soft.h | 14 +-
 lib/gpu/lal_soft_ext.cpp | 14 +-
 lib/gpu/lal_sw_ext.cpp | 24 +--
 lib/gpu/lal_table.cpp | 88 ++++----
 lib/gpu/lal_table.cu | 328 ++++++++++++++--------------
 lib/gpu/lal_table.h | 36 ++--
 lib/gpu/lal_table_ext.cpp | 12 +-
 lib/gpu/lal_yukawa.cpp | 16 +-
 lib/gpu/lal_yukawa.cu | 52 ++---
 lib/gpu/lal_yukawa.h | 16 +-
 lib/gpu/lal_yukawa_colloid.cpp | 62 +++---
 lib/gpu/lal_yukawa_colloid.cu | 62 +++---
 lib/gpu/lal_yukawa_colloid.h | 28 +--
 lib/gpu/lal_yukawa_colloid_ext.cpp | 24 +--
 lib/gpu/lal_yukawa_ext.cpp | 20 +-
 lib/gpu/lal_zbl.cpp | 24 +--
 lib/gpu/lal_zbl.cu | 76 +++---
 lib/gpu/lal_zbl.h | 20 +-
 lib/gpu/lal_zbl_ext.cpp | 18 +-
 195 files changed, 3257 insertions(+), 3257 deletions(-)

diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp
index dd0b5d2424..bd8c7ef843 100644
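Every hunk in the body of this patch deletes a line that ends in spaces or tabs and re-adds it without them; the code is otherwise untouched, which is why the insertion and deletion counts are identical (3257 each). The commit does not record how the cleanup was produced. As a rough illustration of the operation itself, here is a minimal C++ filter that strips trailing whitespace from a stream (illustrative only, not the tool used for this commit):

    #include <iostream>
    #include <string>

    // Remove trailing spaces and tabs from every line of stdin.
    int main() {
      std::string line;
      while (std::getline(std::cin, line)) {
        const std::size_t last = line.find_last_not_of(" \t");
        if (last == std::string::npos)
          line.clear();          // the line was entirely whitespace
        else
          line.erase(last + 1);  // drop everything after the last non-blank
        std::cout << line << '\n';
      }
      return 0;
    }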
--- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,7 +24,7 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false), } template -int AnswerT::bytes_per_atom() const { +int AnswerT::bytes_per_atom() const { int bytes=11*sizeof(acctyp); if (_rot) bytes+=4*sizeof(acctyp); @@ -38,19 +38,19 @@ bool AnswerT::alloc(const int inum) { _max_local=static_cast(static_cast(inum)*1.10); bool success=true; - + _ans_fields=4; if (_rot) _ans_fields+=4; - + // --------------------------- Device allocations success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); - - _allocated=true; + + _allocated=true; return success; } @@ -69,21 +69,21 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot, if (_charge) _e_fields++; _ev_fields=6+_e_fields; - + // Initialize atom and nbor data int ef_inum=inum; if (ef_inum==0) ef_inum=1000; - + // Initialize timers for the selected device time_answer.init(*dev); time_answer.zero(); _time_cast=0.0; _time_cpu_idle=0.0; - + return success && alloc(ef_inum); } - + template bool AnswerT::add_fields(const bool charge, const bool rot) { bool realloc=false; @@ -127,15 +127,15 @@ void AnswerT::clear() { template double AnswerT::host_memory_usage() const { int atom_bytes=4; - if (_charge) + if (_charge) atom_bytes+=1; - if (_rot) + if (_rot) atom_bytes+=4; int ans_bytes=atom_bytes+_ev_fields; return ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(Answer); } - + template void AnswerT::copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom) { @@ -144,8 +144,8 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag, _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; - - int csize=_ev_fields; + + int csize=_ev_fields; if (!eflag) csize-=_e_fields; if (!vflag) @@ -180,7 +180,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int i=0; i<_inum; i++) evdwl+=engv[i]; if (_ef_atom) - if (_ilist==NULL) + if (_ilist==NULL) for (int i=0; i<_inum; i++) eatom[i]+=engv[i]; else @@ -196,18 +196,18 @@ double AnswerT::energy_virial(double *eatom, double **vatom, if (_vf_atom) if (_ilist==NULL) { int ii=0; - for (int i=vstart; i -int AtomT::bytes_per_atom() const { +int AtomT::bytes_per_atom() const { int id_space=0; if (_gpu_nbor==1) id_space=2; @@ -51,7 +51,7 @@ bool AtomT::alloc(const int nall) { _max_atoms=static_cast(static_cast(nall)*1.10); bool success=true; - + // Ignore host/device transfers? 
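An aside on the context visible in the lal_answer.cpp and lal_atom.cpp hunks above: both classes size their device buffers to 1.10x the requested count (_max_local, _max_atoms) inside alloc(), so small fluctuations in the local atom count between reneighbor steps do not force a reallocation every timestep. A minimal host-side sketch of that growth idiom; DeviceBuffer and its std::vector backing are hypothetical stand-ins for the library's UCL containers:

    #include <vector>

    // Grow-by-10% reallocation idiom, as in Answer::alloc() and Atom::alloc().
    // DeviceBuffer is a stand-in; the real code allocates device memory.
    template <class T>
    struct DeviceBuffer {
      std::vector<T> data;
      int capacity = 0;

      // Returns true if this call had to reallocate.
      bool reserve_for(int n) {
        if (n <= capacity) return false;
        capacity = static_cast<int>(static_cast<double>(n) * 1.10);
        data.resize(capacity);
        return true;
      }
    };

The resize() helpers in the headers above likewise report whether any call during the timestep triggered a reallocation.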
_host_view=false; if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) { @@ -60,11 +60,11 @@ bool AtomT::alloc(const int nall) { assert(0==1); #endif } - + // Allocate storage for CUDPP sort #ifdef USE_CUDPP if (_gpu_nbor==1) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -110,7 +110,7 @@ bool AtomT::alloc(const int nall) { } else { success=success && (host_particle_id.alloc(_max_atoms,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && + success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); } if (_gpu_nbor==2 && _host_view) @@ -124,8 +124,8 @@ bool AtomT::alloc(const int nall) { gpu_bytes+=x.device.row_bytes(); if (gpu_bytes>_max_gpu_bytes) _max_gpu_bytes=gpu_bytes; - - _allocated=true; + + _allocated=true; return success; } @@ -135,7 +135,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, bool success=true; // Ignore host/device transfers? int gpu_bytes=0; - + if (charge && _charge==false) { _charge=true; _other=true; @@ -179,7 +179,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, _gpu_nbor=gpu_nbor; #ifdef USE_CUDPP if (_gpu_nbor==1) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -198,9 +198,9 @@ bool AtomT::add_fields(const bool charge, const bool rot, } else { success=success && (host_particle_id.alloc(_max_atoms,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && + success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); - } + } } return success; @@ -230,7 +230,7 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - + // Initialize timers for the selected device time_pos.init(*dev); time_q.init(*dev); @@ -241,14 +241,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_quat.zero(); time_vel.zero(); _time_cast=0.0; - + #ifdef GPU_CAST compile_kernels(*dev); #endif - + return success && alloc(ef_nall); } - + template void AtomT::clear_resize() { if (!_allocated) @@ -274,7 +274,7 @@ void AtomT::clear_resize() { #ifdef USE_CUDPP if (_gpu_nbor==1) cudppDestroyPlan(sort_plan); #endif - + if (_gpu_nbor==2) { host_particle_id.clear(); host_cell_id.clear(); @@ -305,21 +305,21 @@ void AtomT::clear() { template double AtomT::host_memory_usage() const { int atom_bytes=4; - if (_charge) + if (_charge) atom_bytes+=1; - if (_rot) + if (_rot) atom_bytes+=4; - if (_vel) + if (_vel) atom_bytes+=4; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } - + // Sort arrays for neighbor list calculation template void AtomT::sort_neighbor(const int num_atoms) { #ifdef USE_CUDPP - CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), - (int *)dev_particle_id.begin(), + CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), + (int *)dev_particle_id.begin(), 8*sizeof(unsigned), num_atoms); if (CUDPP_SUCCESS != result) { printf("Error in cudppSort\n"); diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu index 2a78719ffb..28ff31c566 100644 --- a/lib/gpu/lal_atom.cu +++ b/lib/gpu/lal_atom.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// 
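The lal_atom.cpp hunks above also touch the optional CUDPP path: when neighboring runs on the device (_gpu_nbor==1), alloc() creates a cudppPlan() sized for _max_atoms, and sort_neighbor() radix-sorts the cell ids with the particle ids attached as values, using 8*sizeof(unsigned) key bits. A self-contained sketch of that plan/sort/destroy sequence follows; the CUDPPConfiguration field values are assumptions, since the patch never shows how sort_config is initialized:

    #include <cstdio>
    #include "cudpp.h"

    // Key-value radix sort of cell ids / particle ids, mirroring
    // Atom::alloc() and Atom::sort_neighbor() above.
    bool sort_cells(unsigned *d_cell_id, int *d_particle_id,
                    int max_atoms, int num_atoms) {
      CUDPPConfiguration config;
      config.algorithm = CUDPP_SORT_RADIX;             // assumed
      config.datatype  = CUDPP_UINT;                   // assumed
      config.op        = CUDPP_ADD;                    // unused by the sort
      config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS; // assumed

      CUDPPHandle plan;
      if (cudppPlan(&plan, config, max_atoms, 1, 0) != CUDPP_SUCCESS)
        return false;
      CUDPPResult result = cudppSort(plan, d_cell_id, d_particle_id,
                                     8 * sizeof(unsigned), num_atoms);
      if (result != CUDPP_SUCCESS)
        printf("Error in cudppSort\n");
      cudppDestroyPlan(plan);
      return result == CUDPP_SUCCESS;
    }

Note the real code keeps the plan alive across timesteps (created in alloc(), destroyed in clear_resize()) rather than rebuilding it per sort as this sketch does.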
begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,9 +17,9 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, +__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, const __global double *restrict x, - const __global int *restrict type, + const __global int *restrict type, const int nall) { int ii=GLOBAL_ID_X; diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 23112fe712..1b4e17d972 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -57,19 +57,19 @@ class Atom { /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - + /// Memory usage per atom in this class - int bytes_per_atom() const; + int bytes_per_atom() const; /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor 0 if neighboring will be performed on host * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ - bool init(const int nall, const bool charge, const bool rot, - UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, + bool init(const int nall, const bool charge, const bool rot, + UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, const bool vel=false); - + /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ inline bool resize(const int nall, bool &success) { @@ -81,7 +81,7 @@ class Atom { } return _resized; } - + /// If already initialized by another LAMMPS style, add fields as necessary /** \param rot True if atom storage needs quaternions * \param gpu_nbor 0 if neighboring will be performed on host @@ -89,28 +89,28 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, const bool bonds, const bool vel=false); - + /// Returns true if GPU is using charges bool charge() { return _charge; } - + /// Returns true if GPU is using quaternions bool quaternion() { return _rot; } - + /// Returns true if GPU is using velocities bool velocity() { return _vel; } /// Only free matrices of length inum or nall for resizing void clear_resize(); - + /// Free all memory on host and device void clear(); - + /// Return the total amount of host memory used by class in bytes double host_memory_usage() const; /// Sort arrays for neighbor list calculation on device void sort_neighbor(const int num_atoms); - + /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); @@ -150,18 +150,18 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } - + return total+_time_transfer/1000.0; } - + /// Return the total time for data cast/pack /** Zeros the time so that atom times are only included once **/ - inline double cast_time() + inline double cast_time() { double t=_time_cast; _time_cast=0.0; return t; } /// Pack LAMMPS atom type constants into matrix and copy to device template - inline void type_pack1(const int n, const int m_size, + inline void type_pack1(const int n, const int m_size, UCL_D_Vec &dev_v, 
UCL_H_Vec &buffer, t1 **one) { int ii=0; @@ -215,7 +215,7 @@ class Atom { view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); ucl_copy(dev_v,view,false); } - + /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device template inline void type_pack4(const int n, const int m_size, @@ -239,7 +239,7 @@ class Atom { /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device template - inline void self_pack2(const int n, UCL_D_Vec &dev_v, + inline void self_pack2(const int n, UCL_D_Vec &dev_v, UCL_H_Vec &buffer, t1 **one, t2 **two) { for (int i=0; i(one[i][i]); @@ -279,7 +279,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double **host_ptr, int *host_type) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -376,7 +376,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double **host_ptr, tagint *host_tag) { + inline void add_v_data(double **host_ptr, tagint *host_tag) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST @@ -407,8 +407,8 @@ class Atom { inline void add_transfer_time(double t) { _time_transfer+=t; } /// Return number of bytes used on device - inline double max_gpu_bytes() - { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } /// Returns true if the device is addressing memory on the host inline bool host_view() { return _host_view; } @@ -422,7 +422,7 @@ class Atom { /// Quaterions UCL_Vector quat; /// Velocities - UCL_Vector v; + UCL_Vector v; #ifdef GPU_CAST UCL_Vector x_cast; @@ -436,7 +436,7 @@ class Atom { /// Atom tag information for device nbor builds UCL_D_Vec dev_tag; - + /// Cell list identifiers for hybrid nbor builds UCL_H_Vec host_cell_id; /// Cell list identifiers for hybrid nbor builds @@ -444,7 +444,7 @@ class Atom { /// Device timers UCL_Timer time_pos, time_q, time_quat, time_vel; - + /// Geryon device UCL_Device *dev; @@ -456,19 +456,19 @@ class Atom { #endif bool _compiled; - + // True if data has been copied to device already bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; bool alloc(const int nall); - + bool _allocated, _rot, _charge, _bonds, _vel, _other; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; - + double _max_gpu_bytes; - + #ifdef USE_CUDPP CUDPPConfiguration sort_config; CUDPPHandle sort_plan; diff --git a/lib/gpu/lal_balance.h b/lib/gpu/lal_balance.h index cf09cf86fb..e90e94bee1 100644 --- a/lib/gpu/lal_balance.h +++ b/lib/gpu/lal_balance.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -44,7 +44,7 @@ class Balance { _init_done=false; } } - + /// Return the timestep since initialization inline int timestep() { return _timestep; } @@ -96,7 +96,7 @@ class Balance { inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } } /// Calculate the new host/device split based on the cpu and device times - /** \note Only does calculation every _HD_BALANCE_EVERY timesteps + /** \note Only does calculation every _HD_BALANCE_EVERY timesteps (and first 10) **/ inline void balance(const double 
cpu_time); @@ -105,13 +105,13 @@ class Balance { balance(cpu_time); return get_gpu_count(ago,inum_full); } - + private: Device *_device; UCL_Timer _device_time; bool _init_done; int _gpu_nbor; - + bool _load_balance; double _actual_split, _avg_split, _desired_split, _max_split; int _avg_count; @@ -123,15 +123,15 @@ class Balance { #define BalanceT Balance template -void BalanceT::init(Device *gpu, +void BalanceT::init(Device *gpu, const int gpu_nbor, const double split) { clear(); _gpu_nbor=gpu_nbor; _init_done=true; - + _device=gpu; _device_time.init(*gpu->gpu); - + if (split<0.0) { _load_balance=true; _desired_split=0.90; @@ -163,7 +163,7 @@ int BalanceT::get_gpu_count(const int ago, const int inum_full) { _timestep++; return _inum; } - + template void BalanceT::balance(const double cpu_time) { if (_measure_this_step) { diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 191f218bd8..e59dae1a6f 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -9,10 +9,10 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ - + #include "lal_base_atomic.h" using namespace LAMMPS_AL; #define BaseAtomicT BaseAtomic @@ -63,13 +63,13 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); if (success!=0) return success; - + ucl_device=device->gpu; atom=&device->atom; @@ -139,7 +139,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist, double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; - + return ilist; } @@ -188,7 +188,7 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -217,7 +217,7 @@ template int ** BaseAtomicT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -230,12 +230,12 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -255,7 +255,7 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index eaf55f46e2..e3e9829abc 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov 
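The lal_balance.h hunks above show the host/device split machinery: a fixed split can be passed in, or, when split<0, Balance::init() starts the device at a 0.90 fraction and balance() re-derives the split from measured CPU and device times every _HD_BALANCE_EVERY timesteps. The exact update rule is not part of these hunks, so the following is only a generic sketch of how such a dynamic split is steered toward equal host and device times; the names and the smoothing factor are assumptions:

    // Generic dynamic host/device split (not LAMMPS's exact rule).
    struct SplitBalancer {
      double split = 0.90;  // initial device fraction, as in Balance::init()

      void update(double cpu_time, double gpu_time) {
        // The device handled `split` of the work in gpu_time and the host
        // handled the rest in cpu_time; this is the fraction that would
        // have equalized the two times.
        const double denom = split * cpu_time + (1.0 - split) * gpu_time;
        if (denom <= 0.0) return;
        const double ideal = split * cpu_time / denom;
        const double alpha = 0.25;  // smoothing factor (assumed)
        split = (1.0 - alpha) * split + alpha * ideal;
      }

      // Local atoms the device owns this step, cf. get_gpu_count().
      int gpu_count(int inum_full) const {
        return static_cast<int>(split * inum_full);
      }
    };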
***************************************************************************/ @@ -41,7 +41,7 @@ class BaseAtomic { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -49,8 +49,8 @@ class BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, const void *pair_program, const char *k_name); /// Estimate the overhead for GPU context changes and CPU driver @@ -80,7 +80,7 @@ class BaseAtomic { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -119,7 +119,7 @@ class BaseAtomic { /// Build neighbor list on device void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success); /// Pair loop with host neighboring @@ -133,19 +133,19 @@ class BaseAtomic { int * compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); /// Pair loop with device neighboring int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index e7fe2b62f4..c6341f7d57 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -64,7 +64,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); @@ -153,7 +153,7 @@ template inline void 
BaseChargeT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -192,7 +192,7 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -226,7 +226,7 @@ template int** BaseChargeT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -240,12 +240,12 @@ int** BaseChargeT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -271,7 +271,7 @@ int** BaseChargeT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index e791507432..64c19554b9 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -42,7 +42,7 @@ class BaseCharge { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -83,7 +83,7 @@ class BaseCharge { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -137,12 +137,12 @@ class BaseCharge { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 12e3b20d96..478f0092c7 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
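All of the compute() entry points in the base classes above (atomic, charge, dipole) share one per-step skeleton: accumulate the previous step's timers, let hd_balancer decide how many local atoms the device owns, rebuild the device neighbor list only on reneighbor steps (ago==0), and otherwise just re-cast coordinates before launching the force kernel. A compilable outline of that control flow; every function here is a stub standing in for the class methods visible in the hunks:

    // Placeholder hooks for the device operations in BaseCharge::compute().
    static void acc_timers() {}
    static void zero_timers() {}
    static bool build_nbor_list(int /*inum*/, int /*host_inum*/) { return true; }
    static void cast_and_copy_positions() {}
    static void run_force_kernel() {}
    static void copy_answers_back() {}
    static int  balance_gpu_count(int /*ago*/, int inum_full) { return inum_full; }

    // Rebuild neighbors only when reneighboring happened (ago == 0);
    // otherwise the old device list is still valid and only positions move.
    bool compute_step(int ago, int inum_full) {
      acc_timers();
      if (inum_full == 0) {   // no local atoms: keep timer bookkeeping sane
        zero_timers();
        return true;
      }
      const int inum = balance_gpu_count(ago, inum_full);
      if (ago == 0) {
        if (!build_nbor_list(inum, inum_full - inum)) return false;
      } else {
        cast_and_copy_positions();
      }
      run_force_kernel();
      copy_answers_back();
      return true;
    }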
__________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -65,7 +65,7 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); @@ -155,7 +155,7 @@ template inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -194,7 +194,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -230,12 +230,12 @@ template int** BaseDipoleT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double *host_q, double **host_mu, + double *host_q, double **host_mu, double *boxlo, double *prd) { acc_timers(); if (inum_full==0) { @@ -245,12 +245,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -279,7 +279,7 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h index 2e495c8747..b51c4303cf 100644 --- a/lib/gpu/lal_base_dipole.h +++ b/lib/gpu/lal_base_dipole.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -40,7 +40,7 @@ class BaseDipole { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -82,7 +82,7 @@ class BaseDipole { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -136,12 +136,12 @@ class BaseDipole { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int 
&host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double **mu, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index 0efb68a9fb..941f463b14 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -64,7 +64,7 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom,true); @@ -153,7 +153,7 @@ template inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -182,7 +182,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, tagint *tag, double **host_v, + bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { acc_timers(); @@ -193,7 +193,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -228,12 +228,12 @@ template int** BaseDPDT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd) { acc_timers(); @@ -244,12 +244,12 @@ int** BaseDPDT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -276,7 +276,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 97640ed40e..7a75282d0a 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -40,7 +40,7 @@ class BaseDPD { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -81,7 +81,7 @@ class BaseDPD { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local 
particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -129,20 +129,20 @@ class BaseDPD { int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, tagint *tag, - double **v, const double dtinvsqrt, const int seed, + double **v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd); /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 4200c02e1c..8918a3140c 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -70,7 +70,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,true, 1); @@ -113,7 +113,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, return -8; if (_multiple_forms && gpu_nbor!=0) return -9; - + if (_multiple_forms) ans->force.zero(); @@ -142,7 +142,7 @@ void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); host_olist.clear(); - + if (_compiled) { k_nbor_fast.clear(); k_nbor.clear(); @@ -156,7 +156,7 @@ void BaseEllipsoidT::clear_base() { delete lj_program; _compiled=false; } - + time_nbor1.clear(); time_ellipsoid.clear(); time_nbor2.clear(); @@ -230,7 +230,7 @@ void BaseEllipsoidT::output_times() { if (times[6]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); @@ -241,10 +241,10 @@ void BaseEllipsoidT::output_times() { } // --------------------------------------------------------------------------- -// Pack neighbors to limit thread divergence for lj-lj and ellipse +// Pack neighbors to limit thread divergence for lj-lj and ellipse // --------------------------------------------------------------------------- template -void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, +void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high, const bool shared_types, int ntypes) { @@ -264,18 +264,18 @@ void 
BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, // Copy neighbor list from host // --------------------------------------------------------------------------- template -void BaseEllipsoidT::reset_nbors(const int nall, const int inum, +void BaseEllipsoidT::reset_nbors(const int nall, const int inum, const int osize, int *ilist, int *numj, int *type, int **firstneigh, bool &success) { success=true; - + int mn=nbor->max_nbor_loop(osize,numj,ilist); resize_atom(nall,success); resize_local(inum,0,mn,osize,success); if (!success) return; - + if (_multiple_forms) { int p=0; for (int i=0; i inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -354,7 +354,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, zero_timers(); return NULL; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -394,7 +394,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { @@ -410,7 +410,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall ans->inum(inum); _last_ellipse=std::min(inum,_max_last_ellipse); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -419,7 +419,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall return NULL; atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); - } else { + } else { atom->cast_x_data(host_x,host_type); atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); @@ -444,9 +444,9 @@ double BaseEllipsoidT::host_memory_usage_base() const { } template -void BaseEllipsoidT::compile_kernels(UCL_Device &dev, +void BaseEllipsoidT::compile_kernels(UCL_Device &dev, const void *ellipsoid_string, - const void *lj_string, + const void *lj_string, const char *kname, const bool e_s) { if (_compiled) return; diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index e289430f43..7deeccbf44 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -42,7 +42,7 @@ class BaseEllipsoid { * \param gpu_split fraction of particles handled by device * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -68,7 +68,7 @@ class BaseEllipsoid { quat_tex.bind_float(atom->quat,4); lj_pos_tex.bind_float(atom->x,4); lj_quat_tex.bind_float(atom->quat,4); - } + } } /// Check if there is enough storage for neighbors and realloc if not @@ -78,7 +78,7 @@ class BaseEllipsoid { * \param olist_size size of list of particles from CPU neighboring * \note host_inum is 0 if the host is performing neighboring * \note if GPU is neighboring nlocal+host_inum=total number local particles - * \note if CPU is neighboring olist_size=total number of local particles + * \note if CPU is neighboring 
olist_size=total number of local particles * \note if GPU is neighboring olist_size=0 **/ inline void resize_local(const int nlocal, const int host_inum, const int max_nbors, const int olist_size, @@ -101,7 +101,7 @@ class BaseEllipsoid { /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear_base(); - + /// Output any timing information void output_times(); @@ -130,7 +130,7 @@ class BaseEllipsoid { ans->acc_timers(); } } - + /// Zero timers inline void zero_timers() { time_nbor1.zero(); @@ -148,9 +148,9 @@ class BaseEllipsoid { ans->zero_timers(); } - /// Pack neighbors to limit thread divergence for lj-lj and ellipse + /// Pack neighbors to limit thread divergence for lj-lj and ellipse void pack_nbors(const int GX, const int BX, const int start, const int inum, - const int form_low, const int form_high, + const int form_low, const int form_high, const bool shared_types, int ntypes); /// Copy neighbor list from host @@ -174,17 +174,17 @@ class BaseEllipsoid { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); /// Build neighbor list on accelerator - void build_nbor_list(const int inum, const int host_inum, const int nall, + void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, bool &success); - - // -------------------------- DEVICE DATA ------------------------- + + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; @@ -207,7 +207,7 @@ class BaseEllipsoid { /// Atom Data Atom *atom; - // --------------------------- TYPE DATA -------------------------- + // --------------------------- TYPE DATA -------------------------- /// cut_form.x = cutsq, cut_form.y = form UCL_D_Vec cut_form; @@ -240,7 +240,7 @@ class BaseEllipsoid { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - // True if we want to use fast GB-sphere or sphere-sphere calculations + // True if we want to use fast GB-sphere or sphere-sphere calculations bool _multiple_forms; int **_host_form; int _last_ellipse, _max_last_ellipse; diff --git a/lib/gpu/lal_beck.cpp b/lib/gpu/lal_beck.cpp index 062c095957..165a02b71a 100644 --- a/lib/gpu/lal_beck.cpp +++ b/lib/gpu/lal_beck.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,17 +33,17 @@ BeckT::Beck() : BaseAtomic(), _allocated(false) { } template -BeckT::~Beck() { +BeckT::~Beck() { clear(); } - + template int BeckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BeckT::init(const int ntypes, +int BeckT::init(const int ntypes, double **host_cutsq, double **host_aa, double **host_alpha, double **host_beta, double **host_AA, double **host_BB, @@ -126,7 +126,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int 
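The base_ellipsoid hunks above repeatedly reference packing neighbors "to limit thread divergence for lj-lj and ellipse": when threads of one warp disagree on which branch (ellipsoid-ellipsoid vs. sphere-sphere math) to take, the branches serialize, so pack_nbors() groups each atom's neighbors by interaction form before the force kernels run. The library does this on the device over [form_low, form_high] ranges; a host-side C++ illustration of the same grouping, with assumed form codes:

    #include <algorithm>
    #include <vector>

    enum Form { SPHERE_SPHERE, ELLIPSE_SPHERE, ELLIPSE_ELLIPSE };  // assumed

    // Group one atom's neighbor indices so ellipsoid partners come first.
    // A kernel can then sweep [begin, split) and [split, end) with a
    // uniform branch in each range instead of diverging per neighbor.
    void pack_by_form(std::vector<int> &nbors,
                      const std::vector<Form> &form_of) {
      std::stable_partition(nbors.begin(), nbors.end(), [&](int j) {
        return form_of[j] == ELLIPSE_ELLIPSE;
      });
    }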
GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index 7ccefd8859..7d72128b5f 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,7 +24,7 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_beck(const __global numtyp4 *restrict x_, +__kernel void k_beck(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict beck1, const __global numtyp4 *restrict beck2, const int lj_types, @@ -50,20 +50,20 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -133,7 +133,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -143,7 +143,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, beck1[tid]=beck1_in[tid]; beck2[tid]=beck2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -152,7 +152,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_beck.h b/lib/gpu/lal_beck.h index fa56db2402..db26bebeb0 100644 --- a/lib/gpu/lal_beck.h +++ b/lib/gpu/lal_beck.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Beck : public BaseAtomic { public: Beck(); - ~Beck(); + ~Beck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class Beck : public BaseAtomic { double **host_aa, double **host_alpha, double **host_beta, double **host_AA, double **host_BB, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -67,7 +67,7 @@ class Beck : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index 28ca0df346..1552b640e8 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov 
***************************************************************************/ @@ -77,7 +77,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full, return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void beck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 55cb24d3b0..36898b3910 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BornT::Born() : BaseAtomic(), _allocated(false) { } template -BornT::~Born() { +BornT::~Born() { clear(); } - + template int BornT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,12 +44,12 @@ int BornT::bytes_per_atom(const int max_nbors) const { template int BornT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_born1, double **host_born2, + double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -102,14 +102,14 @@ void BornT::reinit(const int ntypes, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_offset) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, host_born1,host_born2,host_born3); this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, @@ -151,7 +151,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -169,7 +169,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, - &cutsq_sigma, &_lj_types, &sp_lj, + &cutsq_sigma, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index 5f917be846..0ca7fea5fe 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // 
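The loop() implementations in the hunks above (beck, born) all compute their launch grid the same way: BX threads per block, _threads_per_atom threads cooperating on each atom, hence BX/_threads_per_atom atoms per block and enough blocks to cover ans->inum() atoms. That arithmetic, isolated as a plain C++ helper:

    #include <cmath>

    // Blocks needed so that `block_size` threads, grouped `t_per_atom`
    // to an atom, cover all `inum` atoms; mirrors the GX expression in
    // BeckT::loop() and BornT::loop() above.
    int grid_size(int inum, int block_size, int t_per_atom) {
      const int atoms_per_block = block_size / t_per_atom;
      return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                        atoms_per_block));
    }

    // Example: inum = 10000, block_size = 128, t_per_atom = 4
    //   -> 32 atoms per block -> grid_size = ceil(10000 / 32) = 313.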
__________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,16 +24,16 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_born(const __global numtyp4 *restrict x_, +__kernel void k_born(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp2 *restrict cutsq_sigma, - const int lj_types, - const __global numtyp *restrict sp_lj_in, + const int lj_types, + const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -51,20 +51,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); + energy+=factor_lj*(e-coeff2[mtype].w); } if (vflag>0) { virial[0] += delx*delx*force; @@ -113,20 +113,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_born_fast(const __global numtyp4 *restrict x_, +__kernel void k_born_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, + const __global numtyp4 *restrict coeff2_in, const __global numtyp2 *restrict cutsq_sigma, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -137,7 +137,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -146,7 +146,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); + energy+=factor_lj*(e-coeff2[mtype].w); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h index 6fed6461d2..685f4d87a9 100644 --- a/lib/gpu/lal_born.h +++ b/lib/gpu/lal_born.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Born : public BaseAtomic { public: Born(); - 
~Born(); + ~Born(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,20 +38,20 @@ class Born : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_born1, double **host_born2, + double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, - double **host_d, double **host_sigma, + double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -77,7 +77,7 @@ class Born : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp index 94becf8c69..242961e80c 100644 --- a/lib/gpu/lal_born_coul_long.cpp +++ b/lib/gpu/lal_born_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,17 +37,17 @@ template BornCoulLongT::~BornCoulLongT() { clear(); } - + template int BornCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **host_sigma, double **host_offset, +int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -85,11 +85,11 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_d,host_offset); - + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, host_cut_ljsq,host_sigma); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -142,7 +142,7 @@ void 
BornCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,15 +157,15 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->q, + &nbor_pitch, &this->atom->q, &cutsq_sigma, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu index 3d74f2087a..4cb4ea448f 100644 --- a/lib/gpu/lal_born_coul_long.cu +++ b/lib/gpu/lal_born_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_born_long(const __global numtyp4 *restrict x_, +__kernel void k_born_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - if (rsq < cut_coulsq) - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < coeff1[mtype].w) { - numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv - + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii -} - -__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int 
nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const int t_per_atom) { - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); - - __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[8]; - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - if (tid0) - coeff2[tid]=coeff2_in[tid]; - } - - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); - - if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp4 *restrict cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double 
*host_special_coul, const double qqrd2e, const double g_ewald); @@ -59,12 +59,12 @@ class BornCoulLong : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.w = born3 UCL_D_Vec coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset UCL_D_Vec coeff2; - /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.z = sigma UCL_D_Vec cutsq_sigma; /// Special LJ values [0-3] and Special Coul values [4-7] @@ -73,7 +73,7 @@ class BornCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index 382e9a2b2c..8c1ff0413f 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,9 +30,9 @@ static BornCoulLong BORNCLMF; int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, - double **sigma, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { @@ -58,10 +58,10 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->world_barrier(); @@ -78,14 +78,14 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ 
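Note: the coul/long variants above fold the Ewald real-space screening into the pair loop; the hunks show the accumulated e_coul += prefactor*(_erfc-factor_coul) term. For reference, a standalone C++ sketch of the underlying pair quantities follows, calling std::erfc where the kernels use the usual polynomial fit. qqrd2e = 332.06371 is the Coulomb conversion constant for LAMMPS "real" units; the other numbers and names are illustrative.

// Sketch of the real-space Ewald pair term accumulated by k_born_long et al.
//   E_coul(r)   = qqrd2e*qi*qj*erfc(g*r)/r
//   F_coul(r)*r = qqrd2e*qi*qj/r * (erfc(g*r) + (2/sqrt(pi))*g*r*exp(-(g*r)^2))
#include <cmath>
#include <cstdio>

const double EWALD_F = 1.12837917;   // 2/sqrt(pi), same constant as the kernels

double ewald_real(double qqrd2e, double qi, double qj, double r,
                  double g_ewald, double &forcecoul) {
  double grij = g_ewald*r;
  double expm2 = std::exp(-grij*grij);
  double erfc = std::erfc(grij);     // the kernels approximate this polynomially
  double prefactor = qqrd2e*qi*qj/r;
  forcecoul = prefactor*(erfc + EWALD_F*grij*expm2);  // F*r; kernel then *r2inv
  return prefactor*erfc;             // per-pair Coulomb energy
}

int main() {
  double f;
  double e = ewald_real(332.06371, 1.0, -1.0, 3.0, 0.3, f);
  std::printf("e_coul = %g  forcecoul = %g\n", e, f);
  return 0;
}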
void borncl_gpu_clear() { int** borncl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -112,7 +112,7 @@ int** borncl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void borncl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index 7615c1dd53..fa832206ee 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,17 +37,17 @@ template BornCoulWolfT::~BornCoulWolfT() { clear(); } - + template int BornCoulWolfT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **host_sigma, double **host_offset, +int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -85,11 +85,11 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_d,host_offset); - + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, host_cut_ljsq,host_sigma); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -144,7 +144,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,17 +157,17 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq_sigma, &_cut_coulsq, &_qqrd2e, - &_alf, &_e_shift, &_f_shift, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &_alf, &_e_shift, &_f_shift, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, 
&ainum, &nbor_pitch, &this->atom->q, &cutsq_sigma, &_cut_coulsq, - &_qqrd2e, &_alf, &_e_shift, &_f_shift, + &_qqrd2e, &_alf, &_e_shift, &_f_shift, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu index e7706b408a..0dc7d08c63 100644 --- a/lib/gpu/lal_born_coul_wolf.cu +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -31,21 +31,21 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_born_wolf(const __global numtyp4 *restrict x_, +__kernel void k_born_wolf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp alf, const numtyp e_shift, + const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -67,20 +67,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -108,12 +108,12 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp v_sh = (numtyp)0.0; numtyp rexp = (numtyp)0.0; - + if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; - forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; @@ -147,7 +147,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -165,20 +165,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, +__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, + const __global numtyp4 *restrict coeff2_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global 
acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp alf, const numtyp e_shift, + const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -193,7 +193,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -201,23 +201,23 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -244,12 +244,12 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp v_sh = (numtyp)0.0; numtyp rexp = (numtyp)0.0; - + if (rsq < cutsq_sigma[mtype].y) { numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; - forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; diff --git a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h index 9e02d23233..4b2406b989 100644 --- a/lib/gpu/lal_born_coul_wolf.h +++ b/lib/gpu/lal_born_coul_wolf.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,19 +30,19 @@ class BornCoulWolf : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double alf, const double e_shift, @@ -60,12 +60,12 @@ class BornCoulWolf : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// coeff1.x = rhoinv, 
coeff1.y = born1, coeff1.z = born2, + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.w = born3 UCL_D_Vec coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset UCL_D_Vec coeff2; - /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.z = sigma UCL_D_Vec cutsq_sigma; /// Special LJ values [0-3] and Special Coul values [4-7] @@ -74,7 +74,7 @@ class BornCoulWolf : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift; diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index b56c526119..5083afe0c4 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,7 +28,7 @@ static BornCoulWolf BORNCWMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, + double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -60,9 +60,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, - host_cut_coulsq, host_special_coul, qqrd2e, + host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->world_barrier(); @@ -79,15 +79,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, - host_cut_coulsq, host_special_coul, qqrd2e, + host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -104,7 +104,7 @@ void borncw_gpu_clear() { int** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -114,7 +114,7 @@ int** borncw_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, 
special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void borncw_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 6bd51e6d68..171020e769 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Born BORNMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, double **host_c, - double **host_d, double **sigma, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -56,7 +56,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -75,13 +75,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,24 +102,24 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv, int world_me=BORNMF.device->world_me(); int gpu_rank=BORNMF.device->gpu_rank(); int procs_per_gpu=BORNMF.device->procs_per_gpu(); - + if (world_me==0) BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, offset); - + BORNMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } void born_gpu_clear() { - BORNMF.clear(); + BORNMF.clear(); } int ** born_gpu_compute_n(const int ago, const int inum_full, @@ -132,7 +132,7 @@ int ** born_gpu_compute_n(const int ago, const int inum_full, return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void born_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index f66759ee3a..aa82f0014d 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
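Note: in the born/coul/wolf kernels diffed above, each atom also accumulates a Wolf-summation self-energy, visible in the hunks as e_self = -((acctyp)0.5*e_shift + alf/MY_PIS)*qtmp*qtmp*qqrd2e/t_per_atom; the division by t_per_atom and the later factor 2 are bookkeeping (atom i is split across t_per_atom threads, and stored pair energies are halved at the end). A standalone sketch of the physics, assuming the standard Wolf definitions e_shift = erfc(alpha*Rc)/Rc and the matching force shift; identifiers are illustrative.

// Sketch of the Wolf-summation constants and self-energy used by k_born_wolf.
#include <cmath>
#include <cstdio>

const double MY_PIS = 1.77245385090551602729;   // sqrt(pi), as in the kernels

struct Wolf { double alpha, e_shift, f_shift; };

Wolf make_wolf(double alpha, double rc) {
  double e_shift = std::erfc(alpha*rc)/rc;
  double f_shift = -(e_shift + 2.0*alpha/MY_PIS
                     * std::exp(-alpha*alpha*rc*rc))/rc;
  return {alpha, e_shift, f_shift};
}

// Per-atom self energy: E_self = -(e_shift/2 + alpha/sqrt(pi))*qqrd2e*q^2
double wolf_self(const Wolf &w, double qqrd2e, double q) {
  return -(0.5*w.e_shift + w.alpha/MY_PIS)*qqrd2e*q*q;
}

int main() {
  Wolf w = make_wolf(0.25, 9.0);                 // arbitrary alpha and cutoff
  std::printf("e_self(q=+1) = %g\n", wolf_self(w, 332.06371, 1.0));
  return 0;
}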
__________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BuckT::Buck() : BaseAtomic(), _allocated(false) { } template -BuckT::~Buck() { +BuckT::~Buck() { clear(); } - + template int BuckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,11 +44,11 @@ int BuckT::bytes_per_atom(const int max_nbors) const { template int BuckT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -95,14 +95,14 @@ template void BuckT::reinit(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, host_buck1,host_buck2,host_cutsq); this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, @@ -143,7 +143,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -154,13 +154,13 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 955547e598..c1e1c7d7e2 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_buck(const __global numtyp4 *restrict x_, +__kernel void k_buck(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, + const __global numtyp4 *restrict coeff2, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int 
*dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); + energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,19 +111,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_buck_fast(const __global numtyp4 *restrict x_, +__kernel void k_buck_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -134,7 +134,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -143,7 +143,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); + energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index ebcd72d990..3b84066355 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Buck : public BaseAtomic { public: Buck(); - ~Buck(); + ~Buck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,18 +38,18 @@ class Buck : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double 
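Note: k_buck/k_buck_fast above evaluate the plain Buckingham potential; the hunks show rexp = ucl_exp(-r*coeff1.x) with coeff1.x = 1/rho and the energy a*rexp - c/r^6 - offset. A standalone C++ restatement follows, assuming buck1 = a/rho and buck2 = 6*c as in the CPU pair_buck; the sample parameters are arbitrary.

// Sketch of the Buckingham pair term evaluated by k_buck:
//   E(r)   = a*exp(-r/rho) - c/r^6 - offset
//   F(r)/r = ((a/rho)*r*exp(-r/rho) - 6c/r^6) / r^2
#include <cmath>
#include <cstdio>

double buck_pair(double a, double rho, double c, double offset,
                 double rsq, double &fpair) {
  double r2inv = 1.0/rsq;
  double r6inv = r2inv*r2inv*r2inv;
  double r = std::sqrt(rsq);
  double rexp = std::exp(-r/rho);
  fpair = ((a/rho)*r*rexp - 6.0*c*r6inv)*r2inv;  // buck1*r*rexp - buck2*r6inv
  return a*rexp - c*r6inv - offset;
}

int main() {
  double fpair;
  double e = buck_pair(1388.77, 0.3623, 175.0, 0.0, 2.5*2.5, fpair);
  std::printf("E = %g  F/r = %g\n", e, fpair);
  return 0;
}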
cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -72,7 +72,7 @@ class Buck : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index bec640e7a6..9de019d871 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BuckCoulT::BuckCoul() : BaseCharge(), _allocated(false) { } template -BuckCoulT::~BuckCoul() { +BuckCoulT::~BuckCoul() { clear(); } - + template int BuckCoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,11 +44,11 @@ int BuckCoulT::bytes_per_atom(const int max_nbors) const { template int BuckCoulT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { @@ -82,20 +82,20 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_offset); - + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq, host_cut_ljsq, host_cut_coulsq); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; host_write[i+4]=host_special_coul[i]; } ucl_copy(sp_lj,host_write,8,false); - + _qqrd2e = qqrd2e; - + _allocated=true; this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes(); return 0; @@ -135,7 +135,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,12 +147,12 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, &this->atom->q, + &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); 
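Note: the grid-size expression recurring in every loop() above, GX = ceil(inum/(BX/t_per_atom)), sizes the launch so that t_per_atom threads cooperate on each atom and a block of BX threads covers BX/t_per_atom atoms. A tiny sketch of the same integer arithmetic (names illustrative):

// Sketch of the grid sizing used by the loop() methods.
#include <cstdio>

int grid_size(int inum, int block_size, int t_per_atom) {
  int atoms_per_block = block_size/t_per_atom;          // threads split per atom
  return (inum + atoms_per_block - 1)/atoms_per_block;  // integer ceiling
}

int main() {
  // e.g. 10000 atoms, 128-thread blocks, 4 threads cooperating per atom
  std::printf("GX = %d\n", grid_size(10000, 128, 4));   // prints GX = 313
  return 0;
}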
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 87604a02ea..6f0d414825 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul(const __global numtyp4 *restrict x_, +__kernel void k_buck_coul(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp4 *restrict cutsq, + const __global numtyp4 *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -63,21 +63,21 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -180,7 +180,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h index e4bf59107c..3f8428bfe1 100644 --- a/lib/gpu/lal_buck_coul.h +++ b/lib/gpu/lal_buck_coul.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class BuckCoul : public BaseCharge { public: BuckCoul(); - ~BuckCoul(); + ~BuckCoul(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,11 +38,11 @@ class BuckCoul : public BaseCharge { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, 
- const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); @@ -71,11 +71,11 @@ class BuckCoul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + numtyp _qqrd2e; - + private: bool _allocated; void loop(const bool _eflag, const bool _vflag); diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index dd696fc6bb..3335f4ba47 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,8 +28,8 @@ static BuckCoul BUCKCMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, @@ -57,9 +57,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -77,14 +77,14 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,12 +96,12 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } void buckc_gpu_clear() { - BUCKCMF.clear(); + BUCKCMF.clear(); } int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -111,7 +111,7 @@ int ** buckc_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void buckc_gpu_compute(const int ago, const int inum_full, const 
int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp index 4aa720132a..bf9b5fb101 100644 --- a/lib/gpu/lal_buck_coul_long.cpp +++ b/lib/gpu/lal_buck_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template BuckCoulLongT::~BuckCoulLongT() { clear(); } - + template int BuckCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int BuckCoulLongT::bytes_per_atom(const int max_nbors) const { template int BuckCoulLongT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, double **host_offset, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -84,10 +84,10 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_offset); - + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -139,7 +139,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -150,16 +150,16 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq, &_cut_coulsq, &_qqrd2e, + &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index fc68d12471..da3237a31f 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul_long(const __global numtyp4 *restrict 
x_, +__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - if (rsq < cut_coulsq) - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < coeff1[mtype].w) { - numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii -} - -__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, - const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald, - const int t_per_atom) { - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); - - __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[8]; - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - if (tid0) - coeff2[tid]=coeff2_in[tid]; - } - - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); - - if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; + energy+=factor_lj*(e-coeff2[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int 
*dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp cut_coulsq, + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,11 +38,11 @@ class BuckCoulLong : public BaseCharge { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -71,7 +71,7 @@ class BuckCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index 9c0c331ee1..51e0d233d3 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,7 +28,7 @@ static BuckCoulLong BUCKCLMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, + double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -58,8 +58,8 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, 
cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,13 +77,13 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -100,7 +100,7 @@ void buckcl_gpu_clear() { int** buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -110,7 +110,7 @@ int** buckcl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index 75c88e8dbe..36a780426c 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,8 +28,8 @@ static Buck BUCKMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -55,7 +55,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -73,12 +73,12 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,24 +98,24 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv, int world_me=BUCKMF.device->world_me(); int 
gpu_rank=BUCKMF.device->gpu_rank(); int procs_per_gpu=BUCKMF.device->procs_per_gpu(); - + if (world_me==0) BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset); - + BUCKMF.device->world_barrier(); for (int i=0; igpu_barrier(); } } void buck_gpu_clear() { - BUCKMF.clear(); + BUCKMF.clear(); } int ** buck_gpu_compute_n(const int ago, const int inum_full, @@ -128,7 +128,7 @@ int ** buck_gpu_compute_n(const int ago, const int inum_full, return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void buck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_cg_cmm.cpp b/lib/gpu/lal_cg_cmm.cpp index 96455888f0..11974e05e0 100644 --- a/lib/gpu/lal_cg_cmm.cpp +++ b/lib/gpu/lal_cg_cmm.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,23 +33,23 @@ CGCMMT::CGCMM() : BaseAtomic(), _allocated(false) { } template -CGCMMT::~CGCMM() { +CGCMMT::~CGCMM() { clear(); } - + template int CGCMMT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CGCMMT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int CGCMMT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -75,7 +75,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq, host_write[i]=0.0; lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); - this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, + this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, host_cg_type,host_lj1,host_lj2); lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); @@ -126,7 +126,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,7 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu index 8f89f74d22..70d2ab6092 100644 --- a/lib/gpu/lal_cg_cmm.cu +++ b/lib/gpu/lal_cg_cmm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -24,15 
+24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii class CGCMM : public BaseAtomic { public: CGCMM(); - ~CGCMM(); + ~CGCMM(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class CGCMM : public BaseAtomic { int init(const int ntypes, double **host_cutsq, int **host_cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -66,7 +66,7 @@ class CGCMM : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _cmm_types; private: diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_cg_cmm_ext.cpp index 0d2c3d8fbf..2a00271736 100644 --- a/lib/gpu/lal_cg_cmm_ext.cpp +++ b/lib/gpu/lal_cg_cmm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static CGCMM CMMMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { CMMMF.clear(); @@ -55,7 +55,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, int init_ok=0; if (world_me==0) - init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,7 +78,7 @@ int cmm_gpu_init(const int ntypes, double 
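/* Host side, every loop() method in these files sizes its launch the same
   way: t_per_atom threads cooperate on each atom, a block of BX threads
   therefore covers BX/t_per_atom atoms, and GX is the ceiling of inum over
   that count -- which is all the static_cast/ceil expression above computes.
   The same computation in integer arithmetic (grid_size is an illustrative
   name): */
static int grid_size(int inum, int BX, int t_per_atom) {
  int atoms_per_block = BX / t_per_atom;                  // threads sharing one atom
  return (inum + atoms_per_block - 1) / atoms_per_block;  // ceil(inum/atoms_per_block)
}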
**cutsq, int **cg_types, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -103,7 +103,7 @@ int** cmm_gpu_compute_n(const int ago, const int inum_full, return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void cmm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_cg_cmm_long.cpp b/lib/gpu/lal_cg_cmm_long.cpp index 92e6bd04b5..14b5b7622c 100644 --- a/lib/gpu/lal_cg_cmm_long.cpp +++ b/lib/gpu/lal_cg_cmm_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,22 +37,22 @@ template CGCMMLongT::~CGCMMLong() { clear(); } - + template int CGCMMLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CGCMMLongT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int CGCMMLongT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, - double **host_cut_ljsq, + double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { @@ -137,7 +137,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,13 +149,13 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, &this->atom->q, - &_cut_coulsq, &_qqrd2e, &_g_ewald, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu index ae8b6cda47..f6942d1809 100644 --- a/lib/gpu/lal_cg_cmm_long.cu +++ b/lib/gpu/lal_cg_cmm_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -29,12 +29,12 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, 
+__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -70,7 +70,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; @@ -136,7 +136,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].y) { energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- lj3[mtype].w; - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -154,17 +154,17 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, + const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -179,7 +179,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, lj1[tid]=lj1_in[tid]; lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -187,16 +187,16 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_cg_cmm_long.h b/lib/gpu/lal_cg_cmm_long.h index bde5c79c74..aa0cbfbaf0 100644 --- a/lib/gpu/lal_cg_cmm_long.h +++ b/lib/gpu/lal_cg_cmm_long.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class CGCMMLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class CGCMMLong : public BaseCharge { int init(const int ntypes, double **host_cutsq, int ** cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE 
*screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -58,7 +58,7 @@ class CGCMMLong : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, + /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, UCL_D_Vec lj1; /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset UCL_D_Vec lj3; @@ -68,7 +68,7 @@ class CGCMMLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_cg_cmm_long_ext.cpp index 966588bf9b..2fa3f2aead 100644 --- a/lib/gpu/lal_cg_cmm_long_ext.cpp +++ b/lib/gpu/lal_cg_cmm_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static CGCMMLong CMMLMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, @@ -58,7 +58,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); @@ -82,7 +82,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CMMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -99,7 +99,7 @@ void cmml_gpu_clear() { int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -109,7 +109,7 @@ int** cmml_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q,boxlo,prd); -} +} void cmml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 157072dc22..9cd032b3c6 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS 
Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template CHARMMLongT::~CHARMMLong() { clear(); } - + template int CHARMMLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { template int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -144,7 +144,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -153,17 +153,17 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, - &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index dde50da300..244131f833 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -31,14 +31,14 @@ texture q_tex; __kernel void k_charmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_lj, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, @@ -61,7 +61,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; @@ -93,7 +93,7 @@ 
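/* The pos_tex/q_tex declarations and the fetch4()/fetch() calls seen in
   these .cu files are the library's dual-target indirection: built for CUDA,
   positions and charges are read through texture references for cached
   loads, while the "#define pos_tex x_" fallback seen earlier aliases the
   fetch straight to the global array for the other build paths. A condensed
   sketch of the CUDA-side idea using the legacy texture-reference API
   (pos_sketch, fetch_pos, and USE_TEX are illustrative names, not the
   library's): */
texture<float4, 1, cudaReadModeElementType> pos_sketch;  // bound to x_ at setup
__device__ float4 fetch_pos(int i, const float4 *x_) {
#ifdef USE_TEX
  return tex1Dfetch(pos_sketch, i);   // read through the texture cache
#else
  return x_[i];                       // plain global-memory load
#endif
}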
__kernel void k_charmm_long(const __global numtyp4 *restrict x_, force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); if (rsq > cut_lj_innersq) { switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ denom_lj; switch1 *= switch1; switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ @@ -130,7 +130,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, if (rsq > cut_lj_innersq) e *= switch1; energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -148,19 +148,19 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, const __global numtyp2 *restrict ljd_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, - const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_bothsq, const numtyp cut_ljsq, const numtyp cut_lj_innersq, const int t_per_atom) { int tid, ii, offset; @@ -174,7 +174,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, ljd[tid]=ljd_in[tid]; if (tid+BLOCK_BIO_PAIR cut_lj_innersq) { switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ denom_lj; switch1 *= switch1; switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ diff --git a/lib/gpu/lal_charmm_long.h b/lib/gpu/lal_charmm_long.h index 201a5c3694..011083db13 100644 --- a/lib/gpu/lal_charmm_long.h +++ b/lib/gpu/lal_charmm_long.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class CHARMMLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,12 +40,12 @@ class CHARMMLong : public BaseCharge { int init(const int ntypes, double host_cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald, - const double cut_lj_innersq, const double denom_lj, + const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const 
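/* The switch1/switch2 arithmetic above is the standard CHARMM smoothing of
   the LJ term between the inner cutoff r_in and the outer cutoff r_c.
   Assuming denom_lj carries its usual definition (r_c^2 - r_in^2)^3, the two
   quantities computed are

     S(r)    = (r_c^2 - r^2)^2 (r_c^2 + 2 r^2 - 3 r_in^2) / (r_c^2 - r_in^2)^3
     switch2 = 12 r^2 (r_c^2 - r^2) (r^2 - r_in^2) / (r_c^2 - r_in^2)^3

   S falls smoothly from 1 at r_in to 0 at r_c (both limits follow by direct
   substitution); the energy is scaled by S, as the "e *= switch1" line
   shows, and the force picks up the corresponding dS/dr contribution through
   switch2. */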
bool mix_arithmetic); /// Clear all host and device data @@ -70,7 +70,7 @@ class CHARMMLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e, _g_ewald, _denom_lj; diff --git a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp index 807988a3e8..3f7445f306 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -87,7 +87,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, sigma, mix_arithmetic); CRMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -104,7 +104,7 @@ void crml_gpu_clear() { int** crml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -114,14 +114,14 @@ int** crml_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void crml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, + bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp index 28045217d3..fb2b643e5e 100644 --- a/lib/gpu/lal_colloid.cpp +++ b/lib/gpu/lal_colloid.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,23 +33,23 @@ ColloidT::Colloid() : BaseAtomic(), _allocated(false) { } template -ColloidT::~Colloid() { +ColloidT::~Colloid() { clear(); } - + template int ColloidT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int ColloidT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, double **host_a12, - double **host_a1, double **host_a2, - double **host_d1, double **host_d2, +int ColloidT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, double **host_a12, + double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int nlocal, const int nall, const int 
max_nbors, @@ -97,7 +97,7 @@ int ColloidT::init(const int ntypes, UCL_H_Vec dview_form(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); for (int i=0; iucl_device),UCL_READ_ONLY); for (int i=0; i(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -170,9 +170,9 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &colloid1, &colloid2, &form, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, + &colloid1, &colloid2, &form, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index a4d6c8bf33..89ba71deef 100644 --- a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,18 +24,18 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_colloid(const __global numtyp4 *restrict x_, +__kernel void k_colloid(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global numtyp4 *restrict colloid1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global numtyp4 *restrict colloid1, const __global numtyp4 *restrict colloid2, - const __global int *form, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *form, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -53,20 +53,20 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -176,22 +176,22 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, +__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global numtyp4 *restrict colloid1_in, + const __global numtyp4 *restrict colloid1_in, const __global numtyp4 *restrict colloid2_in, - const __global int *form_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global int *form_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 
lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; @@ -208,7 +208,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -217,7 +217,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h index 416beabcdf..dfbd4dbadd 100644 --- a/lib/gpu/lal_colloid.h +++ b/lib/gpu/lal_colloid.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Colloid : public BaseAtomic { public: Colloid(); - ~Colloid(); + ~Colloid(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class Colloid : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - double **host_a12, double **host_a1, double **host_a2, - double **host_d1, double **host_d2, double **host_sigma3, - double **host_sigma6, int **host_form, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -65,7 +65,7 @@ class Colloid : public BaseAtomic { UCL_D_Vec lj3; /// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2 UCL_D_Vec colloid1; - /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, + /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, /// colloid2.w = sigma6 UCL_D_Vec colloid2; /// form @@ -76,7 +76,7 @@ class Colloid : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index ea83cb6417..f88ced8443 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -29,9 +29,9 @@ static Colloid COLLMF; // --------------------------------------------------------------------------- int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, - double **host_a12, double **host_a1, double **host_a2, - double **host_d1, double 
**host_d2, double **host_sigma3, + double **offset, double *special_lj, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -57,9 +57,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) - init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, - host_a2, host_d1, host_d2, host_sigma3, + host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,13 +78,13 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, host_a12, host_a1, host_a2, - host_d1, host_d2, host_sigma3, host_sigma6, host_form, + offset, special_lj, host_a12, host_a1, host_a2, + host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -109,7 +109,7 @@ int ** colloid_gpu_compute_n(const int ago, const int inum_full, return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void colloid_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_coul.cpp b/lib/gpu/lal_coul.cpp index 53fb3dae82..a06a29e610 100644 --- a/lib/gpu/lal_coul.cpp +++ b/lib/gpu/lal_coul.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -37,7 +37,7 @@ template CoulT::~Coul() { clear(); } - + template int CoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -75,7 +75,7 @@ int CoulT::init(const int ntypes, double **host_scale, double **host_cutsq, scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); - + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); @@ -97,10 +97,10 @@ void CoulT::reinit(const int ntypes, double **host_scale) { // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); } @@ -138,7 +138,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,14 +149,14 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - 
&this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu index e955922a7c..503e674c81 100644 --- a/lib/gpu/lal_coul.cu +++ b/lib/gpu/lal_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndtrung@umich.edu // ***************************************************************************/ @@ -33,14 +33,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -50,7 +50,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -58,13 +58,13 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,13 +39,13 @@ class Coul : public BaseCharge { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_scale, double **host_cutsq, double *host_special_coul, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double qqrd2e); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Coul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_coul_debye.cpp b/lib/gpu/lal_coul_debye.cpp index 990dff6db9..9098aeacb1 100644 --- a/lib/gpu/lal_coul_debye.cpp +++ b/lib/gpu/lal_coul_debye.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
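/* The reinit() path documented above exists so that "fix adapt" can push new
   per-type coefficients to a running pair style: the host zeroes a padded
   write buffer and type_pack1 repacks the ntypes x ntypes table over the
   existing device vector. A rough host-side sketch of that repack under two
   assumptions -- LAMMPS-style 1-based type indexing and a row-major
   flattened layout (both illustrative, not a statement of type_pack1's
   exact contract): */
void repack_scale(int ntypes, int pad, double **host_scale, double *buf) {
  for (int i = 0; i < pad * pad; i++)
    buf[i] = 0.0;                                     // clear padding entries
  for (int i = 1; i <= ntypes; i++)
    for (int j = 1; j <= ntypes; j++)
      buf[(i - 1) * pad + (j - 1)] = host_scale[i][j];
}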
__________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -37,7 +37,7 @@ template CoulDebyeT::~CoulDebye() { clear(); } - + template int CoulDebyeT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -87,7 +87,7 @@ int CoulDebyeT::init(const int ntypes, double **host_scale, _qqrd2e=qqrd2e; _kappa=kappa; - + _allocated=true; this->_max_bytes=cutsq.row_bytes()+scale.row_bytes()+sp_cl.row_bytes(); return 0; @@ -98,10 +98,10 @@ void CoulDebyeT::reinit(const int ntypes, double **host_scale) { // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); } @@ -139,7 +139,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -156,9 +156,9 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu index 0e4c0ea2d0..464a1b18de 100644 --- a/lib/gpu/lal_coul_debye.cu +++ b/lib/gpu/lal_coul_debye.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndtrung@umich.edu // ***************************************************************************/ @@ -31,16 +31,16 @@ texture q_tex; __kernel void k_coul_debye(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const numtyp kappa, const int t_per_atom) { int tid, ii, offset; @@ -59,27 +59,27 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,14 +39,14 @@ class CoulDebye : public BaseCharge { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_scale, double **host_cutsq, double *host_special_coul, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, 
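/* k_coul_debye, whose preamble appears above, screens the bare Coulomb
   interaction with a Debye factor. In the standard coul/debye form this
   kernel family implements,

     E_ij = qqrd2e * scale_ij * q_i q_j * exp(-kappa*r) / r
     F(r) = E_ij * (kappa + 1/r)

   so the exponential is shared between energy and force, and kappa -> 0
   recovers the plain cutoff Coulomb result. The kappa argument threaded
   through the kernel signature and the _kappa member stored by init() above
   are this screening constant. */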
const double cell_size, const double gpu_split, FILE *screen, const double qqrd2e, const double kappa); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -69,7 +69,7 @@ class CoulDebye : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e,_kappa; diff --git a/lib/gpu/lal_coul_debye_ext.cpp b/lib/gpu/lal_coul_debye_ext.cpp index ced08b63e4..f205cd6adf 100644 --- a/lib/gpu/lal_coul_debye_ext.cpp +++ b/lib/gpu/lal_coul_debye_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -75,7 +75,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -93,16 +93,16 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) { int world_me=CDEMF.device->world_me(); int gpu_rank=CDEMF.device->gpu_rank(); int procs_per_gpu=CDEMF.device->procs_per_gpu(); - + if (world_me==0) CDEMF.reinit(ntypes, host_scale); - + CDEMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -123,7 +123,7 @@ int** cdebye_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void cdebye_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp index ca81d32b2d..32c4342fbe 100644 --- a/lib/gpu/lal_coul_dsf.cpp +++ b/lib/gpu/lal_coul_dsf.cpp @@ -37,18 +37,18 @@ template CoulDSFT::~CoulDSF() { clear(); } - + template int CoulDSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, - const int max_nbors, const int maxspecial, +int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -123,7 +123,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,15 +134,15 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, 
&this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu index fc5bf5f138..82c44cd382 100644 --- a/lib/gpu/lal_coul_dsf.cu +++ b/lib/gpu/lal_coul_dsf.cu @@ -31,18 +31,18 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, +__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,19 +60,19 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -102,9 +102,9 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, numtyp erfcd = ucl_exp(-alpha*alpha*rsq); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r); erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd; - forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + + forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + rsq*f_shift-factor_coul); - + force = forcecoul * r2inv; f.x+=delx*force; @@ -131,17 +131,17 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, +__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -149,7 +149,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; if (tid<4) sp_lj[tid]=sp_lj_in[tid]; - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -157,25 
+157,25 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } - + for ( ; nbor { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, - const int max_nbors, const int maxspecial, + int init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha); /// Clear all host and device data @@ -62,7 +62,7 @@ class CoulDSF : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp index e65a090a16..174ec0d839 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -27,11 +27,11 @@ static CoulDSF CDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int cdsf_gpu_init(const int ntypes, const int inum, const int nall, +int cdsf_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { CDMF.clear(); gpu_mode=CDMF.device->gpu_mode(); @@ -55,8 +55,8 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_coulsq, host_special_coul, + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->world_barrier(); @@ -73,12 +73,12 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_coulsq, host_special_coul, + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -95,7 +95,7 @@ void cdsf_gpu_clear() { int** cdsf_gpu_compute_n(const int ago, const int 
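/* Both k_coul_dsf kernels evaluate erfc via the Abramowitz & Stegun 7.1.26
   rational approximation rather than a library call: with
   t = 1/(1 + p*alpha*r),

     erfc(alpha*r) ~= t*(a1 + t*(a2 + t*(a3 + t*(a4 + t*a5)))) * exp(-(alpha*r)^2),

   which is precisely the erfcc/erfcd pair computed from EWALD_P and A1..A5.
   The self-energy block guarded by "if (eflag>0)" adds the damped-shifted-
   force constant term e_self = -(e_shift/2 + alpha/sqrt(pi)) q_i^2 qqrd2e;
   it is divided by t_per_atom so the cooperating threads contribute it once
   in total, and the factor of two in "e_coul += 2*e_self" compensates for
   the halving later applied to energies accumulated over the full,
   double-counted neighbor list. */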
inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -105,7 +105,7 @@ int** cdsf_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void cdsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_coul_ext.cpp b/lib/gpu/lal_coul_ext.cpp index 291546d5b1..c124622cee 100644 --- a/lib/gpu/lal_coul_ext.cpp +++ b/lib/gpu/lal_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -75,7 +75,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -93,16 +93,16 @@ void coul_gpu_reinit(const int ntypes, double **host_scale) { int world_me=COULMF.device->world_me(); int gpu_rank=COULMF.device->gpu_rank(); int procs_per_gpu=COULMF.device->procs_per_gpu(); - + if (world_me==0) COULMF.reinit(ntypes, host_scale); - + COULMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -113,7 +113,7 @@ void coul_gpu_clear() { int** coul_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -123,7 +123,7 @@ int** coul_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void coul_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp index d6e16a9668..513e6d074d 100644 --- a/lib/gpu/lal_coul_long.cpp +++ b/lib/gpu/lal_coul_long.cpp @@ -36,7 +36,7 @@ template CoulLongT::~CoulLong() { clear(); } - + template int CoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -67,13 +67,13 @@ int CoulLongT::init(const int ntypes, double **host_scale, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); - + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_coul[i]; @@ -129,7 +129,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -141,13 +141,13 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { 
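
[Annotation, not part of the patch.] A worked instance of the launch-size arithmetic in CoulLongT::loop above, with assumed values: for inum = 10000 atoms, block size BX = 128 and _threads_per_atom = 4, each block covers 128/4 = 32 atoms, so GX = ceil(10000/32) = 313 thread blocks, which is what k_pair_fast.set_size(GX,BX) then launches.
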
this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 12bbbee7d2..365195e00c 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -123,16 +123,16 @@ texture q_tex; #endif -__kernel void k_coul_long(const __global numtyp4 *restrict x_, +__kernel void k_coul_long(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, - const __global numtyp *restrict sp_cl_in, + const __global numtyp *restrict sp_cl_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { @@ -216,15 +216,15 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale_in, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h index 52ed60111b..6ed9c1a018 100644 --- a/lib/gpu/lal_coul_long.h +++ b/lib/gpu/lal_coul_long.h @@ -30,7 +30,7 @@ class CoulLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -43,10 +43,10 @@ class CoulLong : public BaseCharge { const double gpu_split, FILE *screen, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 5552dc2437..2bc2af082e 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -95,16 +95,16 @@ void cl_gpu_reinit(const int ntypes, double **host_scale) { int world_me=CLMF.device->world_me(); int gpu_rank=CLMF.device->gpu_rank(); int 
procs_per_gpu=CLMF.device->procs_per_gpu(); - + if (world_me==0) CLMF.reinit(ntypes, host_scale); - + CLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index f326657e31..1943de64c6 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -45,8 +45,8 @@ DeviceT::~Device() { template int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double p_split, const int nthreads, + const int last_gpu, const int gpu_mode, + const double p_split, const int nthreads, const int t_per_atom, const double cell_size, char *ocl_vendor, const int block_pair) { _nthreads=nthreads; @@ -83,8 +83,8 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); - - // Get the number of procs per node + + // Get the number of procs per node std::map name_map; std::map::iterator np; for (int i=0; i<_world_size; i++) { @@ -104,12 +104,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, split_id=split_num; split_num++; } - + // Set up a per node communicator and find rank within MPI_Comm node_comm; - MPI_Comm_split(_comm_world, split_id, 0, &node_comm); + MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; - MPI_Comm_rank(node_comm,&node_rank); + MPI_Comm_rank(node_comm,&node_rank); // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ @@ -120,7 +120,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, _time_device=true; if (_procs_per_gpu>1) _time_device=false; - + // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); @@ -128,12 +128,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) return -2; - + #ifndef CUDA_PROXY if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; #endif - + if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; @@ -144,7 +144,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, if (set_ocl_params(ocl_vendor)!=0) return -11; - + int flag=0; for (int i=0; i<_procs_per_gpu; i++) { if (_gpu_rank==i) @@ -162,7 +162,7 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { s_vendor=ocl_vendor; if (s_vendor=="none") s_vendor="generic"; - + if (s_vendor=="kepler") { _ocl_vendor_name="NVIDIA Kepler"; #if defined (__APPLE__) || defined(MACOSX) @@ -170,19 +170,19 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { #else _ocl_vendor_string="-DKEPLER_OCL"; #endif - } else if (s_vendor=="fermi") { + } else if (s_vendor=="fermi") { _ocl_vendor_name="NVIDIA Fermi"; _ocl_vendor_string="-DFERMI_OCL"; - } else if (s_vendor=="cypress") { + } else if (s_vendor=="cypress") { _ocl_vendor_name="AMD Cypress"; _ocl_vendor_string="-DCYPRESS_OCL"; - } else if (s_vendor=="phi") { + } else if (s_vendor=="phi") { _ocl_vendor_name="Intel Phi"; _ocl_vendor_string="-DPHI_OCL"; - } else if (s_vendor=="intel") { + } 
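
[Annotation, not part of the patch.] The node-detection logic in Device::init_device above (gather MPI processor names, assign each distinct name an integer color, split the world communicator by that color) is worth seeing in isolation. A self-contained C++ sketch under the same assumptions; node_rank_of and its details are illustrative, not the library's code:

  #include <mpi.h>
  #include <map>
  #include <string>

  // Group ranks by physical node: every rank on the same host gets the
  // same color, so node_comm contains exactly the ranks sharing that node.
  static int node_rank_of(MPI_Comm world, MPI_Comm *node_comm) {
    char name[MPI_MAX_PROCESSOR_NAME] = {0};
    int len, size;
    MPI_Get_processor_name(name, &len);
    MPI_Comm_size(world, &size);

    // Gather all names so every rank derives the same coloring.
    std::string all((size_t)size * MPI_MAX_PROCESSOR_NAME, '\0');
    MPI_Allgather(name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
                  &all[0], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, world);

    // First-seen order assigns each distinct node name a small integer.
    std::map<std::string, int> color_of;
    int next = 0;
    for (int i = 0; i < size; i++) {
      std::string n(all.c_str() + (size_t)i * MPI_MAX_PROCESSOR_NAME);
      if (color_of.insert(std::make_pair(n, next)).second) next++;
    }

    MPI_Comm_split(world, color_of[std::string(name)], 0, node_comm);
    int node_rank;
    MPI_Comm_rank(*node_comm, &node_rank);
    return node_rank;
  }
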
else if (s_vendor=="intel") { _ocl_vendor_name="Intel CPU"; _ocl_vendor_string="-DINTEL_OCL"; - } else if (s_vendor=="generic") { + } else if (s_vendor=="generic") { _ocl_vendor_name="GENERIC"; _ocl_vendor_string="-DGENERIC_OCL"; } else { @@ -220,10 +220,10 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { template int DeviceT::init(Answer &ans, const bool charge, - const bool rot, const int nlocal, + const bool rot, const int nlocal, const int host_nlocal, const int nall, Neighbor *nbor, const int maxspecial, - const int gpu_host, const int max_nbors, + const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut, const int threads_per_atom, const bool vel) { if (!_device_init) @@ -254,7 +254,7 @@ int DeviceT::init(Answer &ans, const bool charge, // Initialize atom and nbor data if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) return -3; - + _data_in_estimate++; if (charge) _data_in_estimate++; @@ -272,12 +272,12 @@ int DeviceT::init(Answer &ans, const bool charge, if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) return -3; } - + if (!ans.init(ef_nlocal,charge,rot,*gpu)) return -3; if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, - *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, + *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, _warp_size, _time_device, compile_string())) return -3; @@ -294,7 +294,7 @@ template int DeviceT::init(Answer &ans, const int nlocal, const int nall) { if (!_device_init) - return -1; + return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) return -5; @@ -361,7 +361,7 @@ void DeviceT::init_message(FILE *screen, const char *name, if (i==first_gpu) sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ ("; - else + else sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+ toa(gpu->clock_rate(i))+" GHZ ("; if (sizeof(PRECISION)==4) { @@ -381,7 +381,7 @@ void DeviceT::init_message(FILE *screen, const char *name, } template -void DeviceT::estimate_gpu_overhead(const int kernel_calls, +void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec *host_data_in=NULL, *host_data_out=NULL; @@ -394,38 +394,38 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, dev_data_in=new UCL_D_Vec[_data_in_estimate]; timers_in=new UCL_Timer[_data_in_estimate]; } - + if (_data_out_estimate>0) { host_data_out=new UCL_H_Vec[_data_out_estimate]; dev_data_out=new UCL_D_Vec[_data_out_estimate]; timers_out=new UCL_Timer[_data_out_estimate]; } - + if (kernel_calls>0) { kernel_data=new UCL_D_Vec[kernel_calls]; timers_kernel=new UCL_Timer[kernel_calls]; } - + for (int i=0; i<_data_in_estimate; i++) { host_data_in[i].alloc(1,*gpu); dev_data_in[i].alloc(1,*gpu); timers_in[i].init(*gpu); - } - + } + for (int i=0; i<_data_out_estimate; i++) { host_data_out[i].alloc(1,*gpu); dev_data_out[i].alloc(1,*gpu); timers_out[i].init(*gpu); - } - + } + for (int i=0; isync(); gpu_barrier(); @@ -439,7 +439,7 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, ucl_copy(dev_data_in[i],host_data_in[i],true); timers_in[i].stop(); } - + for (int i=0; i0) { delete [] host_data_out; delete [] dev_data_out; delete [] timers_out; } - + if (kernel_calls>0) { delete [] kernel_data; delete [] timers_kernel; } -} +} template -void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, - Neighbor &nbor, 
const double avg_split, +void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, + const double driver_overhead, const int threads_per_atom, FILE *screen) { double single[9], times[9]; int post_final=0; @@ -557,14 +557,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, } template -void DeviceT::output_kspace_times(UCL_Timer &time_in, +void DeviceT::output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer &time_map, UCL_Timer &time_rho, UCL_Timer &time_interp, - Answer &ans, - const double max_bytes, - const double cpu_time, + Answer &ans, + const double max_bytes, + const double cpu_time, const double idle_time, FILE *screen) { double single[8], times[8]; @@ -664,7 +664,7 @@ int DeviceT::compile_kernels() { k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); - + _ptx_arch=static_cast(gpu_lib_data[0])/100.0; #ifndef USE_OPENCL if (_ptx_arch>gpu->arch() || floor(_ptx_arch)arch())) @@ -705,7 +705,7 @@ int DeviceT::compile_kernels() { if (_threads_per_charge & (_threads_per_charge - 1)) _threads_per_charge=1; - return flag; + return flag; } template @@ -718,12 +718,12 @@ template class Device; Device global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, + const int t_per_atom, const double cell_size, char *opencl_vendor, const int block_pair) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, - particle_split,nthreads,t_per_atom, + particle_split,nthreads,t_per_atom, cell_size,opencl_vendor,block_pair); } diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu index 28b58f7760..6761b23fbb 100644 --- a/lib/gpu/lal_device.cu +++ b/lib/gpu/lal_device.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,10 +17,10 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_zero(__global int *restrict mem, +__kernel void kernel_zero(__global int *restrict mem, int numel) { int ii=GLOBAL_ID_X; - + if (ii class PPPM; template class Device { public: Device(); - ~Device(); - + ~Device(); + /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes - * the device (>=first_gpu and <=last_gpu) that this proc will be using + * the device (>=first_gpu and <=last_gpu) that this proc will be using * Returns: * - 0 if successfull * - -2 if GPU not found * - -4 if GPU library not compiled for GPU * - -6 if GPU could not be initialized for use - * - -7 if accelerator sharing is not currently allowed on system + * - -7 if accelerator sharing is not currently allowed on system * - -11 if vendor_string has the wrong number of parameters **/ - int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, + const 
int t_per_atom, const double cell_size, char *vendor_string, const int block_pair); /// Initialize the device for Atom and Neighbor storage @@ -62,9 +62,9 @@ class Device { * 1 if gpu_nbor is true, and host needs a half nbor list, * 2 if gpu_nbor is true, and host needs a full nbor list * \param max_nbors Initial number of rows in the neighbor matrix - * \param cell_size cutoff+skin + * \param cell_size cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel + * than the force kernel * \param threads_per_atom value to be used by the neighbor list only * * Returns: @@ -113,25 +113,25 @@ class Device { /// Returns true if double precision is supported on card inline bool double_precision() { return gpu->double_precision(); } - + /// Output a message with timing information - void output_times(UCL_Timer &time_pair, Answer &ans, - Neighbor &nbor, const double avg_split, + void output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, + const double driver_overhead, const int threads_per_atom, FILE *screen); /// Output a message with timing information void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer & time_map, UCL_Timer & time_rho, - UCL_Timer &time_interp, - Answer &ans, + UCL_Timer &time_interp, + Answer &ans, const double max_bytes, const double cpu_time, const double cpu_idle_time, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data void clear(); - + /// Clear all memory on host and device void clear_device(); @@ -149,24 +149,24 @@ class Device { while (ans_queue.empty()==false) { evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); ans_queue.pop(); - } + } return evdw; } return 0.0; } /// Start timer on host - inline void start_host_timer() + inline void start_host_timer() { _cpu_full=MPI_Wtime(); _host_timer_started=true; } - + /// Stop timer on host - inline void stop_host_timer() { + inline void stop_host_timer() { if (_host_timer_started) { - _cpu_full=MPI_Wtime()-_cpu_full; + _cpu_full=MPI_Wtime()-_cpu_full; _host_timer_started=false; } } - + /// Return host time inline double host_time() { return _cpu_full; } @@ -239,8 +239,8 @@ class Device { /// Number of threads executing concurrently on same multiproc inline int warp_size() const { return _warp_size; } - // -------------------- SHARED DEVICE ROUTINES -------------------- - // Perform asynchronous zero of integer array + // -------------------- SHARED DEVICE ROUTINES -------------------- + // Perform asynchronous zero of integer array void zero(UCL_D_Vec &mem, const int numel) { int num_blocks=static_cast(ceil(static_cast(numel)/ _block_pair)); @@ -248,25 +248,25 @@ class Device { k_zero.run(&mem,&numel); } - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Geryon Device UCL_Device *gpu; enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH}; - // --------------------------- ATOM DATA -------------------------- + // --------------------------- ATOM DATA -------------------------- /// Atom Data Atom atom; // --------------------------- NBOR DATA ---------------------------- - + /// Neighbor Data NeighborShared _neighbor_shared; // ------------------------ LONG RANGE DATA ------------------------- - + // Long Range Data int _long_range_precompute; PPPM *pppm_single; @@ -282,7 +282,7 @@ class Device { 
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, boxlo,prd); } - + inline std::string compile_string() { return _ocl_compile_string; } private: @@ -290,7 +290,7 @@ class Device { int _init_count; bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; - int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, + int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; int _gpu_mode, _first_device, _last_device, _nthreads; double _particle_split; @@ -310,10 +310,10 @@ class Device { int compile_kernels(); int _data_in_estimate, _data_out_estimate; - + std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string; int set_ocl_params(char *); - + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp index e96e15eaf9..c97b76c820 100644 --- a/lib/gpu/lal_dipole_lj.cpp +++ b/lib/gpu/lal_dipole_lj.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template DipoleLJT::~DipoleLJ() { clear(); } - + template int DipoleLJT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int DipoleLJT::bytes_per_atom(const int max_nbors) const { template int DipoleLJT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, @@ -138,7 +138,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,7 +151,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, + &ainum, &nbor_pitch, &this->atom->q, &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { @@ -160,8 +160,8 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->q, - &this->atom->quat, &cutsq, + &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu index b6483d1ef8..42c2bde144 100644 --- a/lib/gpu/lal_dipole_lj.cu +++ b/lib/gpu/lal_dipole_lj.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -132,17 +132,17 @@ texture mu_tex; #endif -__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global 
numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, const __global numtyp *restrict cutsq, @@ -171,14 +171,14 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; @@ -251,7 +251,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; - + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); @@ -263,12 +263,12 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; - + forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; @@ -276,7 +276,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -284,7 +284,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -306,12 +306,12 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; @@ -322,7 +322,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); - } + } } if (vflag>0) { virial[0] += delx*force.x; @@ -340,19 +340,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global 
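
[Annotation, not part of the patch.] A compact restatement of the dipole algebra accumulated above, since the kernel spreads it across many lines: with p_i = mu_i, p_j = mu_j and r the separation vector, the energy line e += r3inv*pdotp - 3.0*r5inv*pidotr*pjdotr is the point-dipole interaction U/qqrd2e = (p_i.p_j)/r^3 - 3(p_i.r)(p_j.r)/r^5. The pre1..pre3 prefactors are the gradient of that expression (the force), and pre2/pre4 scale the mu_i x r and mu_i x mu_j cross products that build the torque T_i = mu_i x E_j. This is standard electrostatics, restated here for review rather than taken from the patch.
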
int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -369,7 +369,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -381,16 +381,16 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; @@ -463,7 +463,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; - + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); @@ -474,13 +474,13 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, } // dipole-charge - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; - + forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; @@ -488,7 +488,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -496,7 +496,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -519,12 +519,12 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0; + acctyp e = (acctyp)0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h index b08b7a8669..615784ee8b 100644 --- a/lib/gpu/lal_dipole_lj.h +++ b/lib/gpu/lal_dipole_lj.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class 
DipoleLJ : public BaseDipole { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class DipoleLJ : public BaseDipole { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class DipoleLJ : public BaseDipole { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 55bbe0b804..2591d3c0ed 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); DPLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,17 +98,17 @@ void dpl_gpu_clear() { int** dpl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double **host_mu, + bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); -} +} void dpl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp index 5a145dc762..a33f38084f 100644 --- a/lib/gpu/lal_dipole_lj_sf.cpp +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template DipoleLJSFT::~DipoleLJSF() { clear(); } - + template int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { template int DipoleLJSFT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double 
*host_special_lj, const int nlocal, const int nall, const int max_nbors, @@ -138,7 +138,7 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,17 +151,17 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, + &ainum, &nbor_pitch, &this->atom->q, &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, - &this->atom->quat, &cutsq, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu index 8469ed9ac9..5769c3a1a1 100644 --- a/lib/gpu/lal_dipole_lj_sf.cu +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -133,20 +133,20 @@ texture mu_tex; #endif -__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const __global numtyp4 *restrict mu_, - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -172,14 +172,14 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; - + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); aforcecoul.x = pre1*delx; aforcecoul.y = pre1*dely; aforcecoul.z = pre1*delz; - + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; presf = (numtyp)2.0*r2inv*pidotr*pjdotr; bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); bforcecoul.z = bfac * 
(pjdotr*mui.z+pidotr*muj.z-presf*delz); - + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); - + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; pre4 = -bfac*r3inv; numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); - + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); @@ -285,12 +285,12 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; rcutcoul2inv=ucl_recip(lj1[mtype].w); pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); - pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qj*r3inv * pqfac; @@ -301,7 +301,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -309,10 +309,10 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; rcutcoul2inv=ucl_recip(lj1[mtype].w); pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); - qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qtmp*r3inv * qpfac; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -334,13 +334,13 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); e = qtmp*qj*rinv*fac*fac; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr * pqfac; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr * qpfac; @@ -350,12 +350,12 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + - rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + - rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + (numtyp)4.0*lj3[mtype].y); energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*force.x; @@ -372,19 +372,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict 
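
[Annotation, not part of the patch.] The shifted-force damping factors in the two dipole/sf kernels above have a compact closed form that is easy to check against the code: with rcutcoul2inv = 1/rc^2, afac = 1 - (r/rc)^4 and bfac = 1 - 4(r/rc)^3 + 3(r/rc)^4. Both vanish at r = rc (bfac: 1 - 4 + 3 = 0), which is exactly what makes the coulombic forces and torques go continuously to zero at the cutoff instead of jumping.
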
sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; @@ -402,7 +402,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -414,16 +414,16 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; - + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); aforcecoul.x = pre1*delx; aforcecoul.y = pre1*dely; aforcecoul.z = pre1*delz; - + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; presf = (numtyp)2.0*r2inv*pidotr*pjdotr; bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); - + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); - + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; pre4 = -bfac*r3inv; @@ -529,11 +529,11 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); - pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qj*r3inv * pqfac; @@ -544,7 +544,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -552,10 +552,10 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); - qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qtmp*r3inv * qpfac; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -577,13 +577,13 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - 
acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); e = qtmp*qj*rinv*fac*fac; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr * pqfac; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr * qpfac; @@ -593,12 +593,12 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + - rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + - rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + (numtyp)4.0*lj3[mtype].y); energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*force.x; diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h index 83cea4c2a4..20357385a2 100644 --- a/lib/gpu/lal_dipole_lj_sf.h +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class DipoleLJSF : public BaseDipole { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class DipoleLJSF : public BaseDipole { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class DipoleLJSF : public BaseDipole { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 8abf78c903..840afbe1c2 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); DPLSFMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,17 +98,17 @@ void dplsf_gpu_clear() { int** dplsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, 
const double cpu_time, - bool &success, double *host_q, double **host_mu, + bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); -} +} void dplsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp index 3736f89323..f05707ef1d 100644 --- a/lib/gpu/lal_dpd.cpp +++ b/lib/gpu/lal_dpd.cpp @@ -33,23 +33,23 @@ DPDT::DPD() : BaseDPD(), _allocated(false) { } template -DPDT::~DPD() { +DPDT::~DPD() { clear(); } - + template int DPDT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int DPDT::init(const int ntypes, - double **host_cutsq, double **host_a0, - double **host_gamma, double **host_sigma, +int DPDT::init(const int ntypes, + double **host_cutsq, double **host_a0, + double **host_gamma, double **host_sigma, double **host_cut, double *host_special_lj, - const bool tstat_only, - const int nlocal, const int nall, - const int max_nbors, const int maxspecial, + const bool tstat_only, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; @@ -90,7 +90,7 @@ int DPDT::init(const int ntypes, _tstat_only = 0; if (tstat_only) _tstat_only=1; - + _allocated=true; this->_max_bytes=coeff.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes(); return 0; @@ -130,7 +130,7 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +147,8 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { &this->_tstat_only, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, &this->_timestep, &this->_tstat_only, @@ -166,5 +166,5 @@ void DPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma, this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma, host_sigma,host_cut); } - + template class DPD; diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu index 209bc0233e..e32404ff5c 100644 --- a/lib/gpu/lal_dpd.cu +++ b/lib/gpu/lal_dpd.cu @@ -37,7 +37,7 @@ texture vel_tex; #define _USE_UNIFORM_SARU_LCG #endif -// References: +// References: // 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. // 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. // PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 @@ -49,9 +49,9 @@ texture vel_tex; #define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns uniformly distributed random numbers u in [-1.0;1.0] -// using the inherent LCG, then multiply u with sqrt(3) to "match" -// with a normal random distribution. 
+// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. // Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) // Curly brackets to make variables local to the scope. #ifdef _USE_UNIFORM_SARU_LCG @@ -80,8 +80,8 @@ texture vel_tex; #endif // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 -// then multiply u with sqrt(3) to "match" with a normal random distribution +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution // Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) #ifdef _USE_UNIFORM_SARU_TEA8 #define SQRT3 (numtyp)1.7320508075688772935274463 @@ -119,7 +119,7 @@ texture vel_tex; #endif // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], // and uses the polar method (Marsaglia's) to transform to a normal random value // This is used to compared with CPU DPD using RandMars::gaussian() #ifdef _USE_GAUSSIAN_SARU_LCG @@ -160,20 +160,20 @@ texture vel_tex; randnum = r2*fac; \ } #endif - -__kernel void k_dpd(const __global numtyp4 *restrict x_, + +__kernel void k_dpd(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp4 *restrict v_, const __global numtyp *restrict cutsq, - const numtyp dtinvsqrt, const int seed, + const numtyp dtinvsqrt, const int seed, const int timestep, const int tstat_only, const int t_per_atom) { int tid, ii, offset; @@ -185,13 +185,13 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii tag2) { tag1 = jtag; tag2 = itag; } - - numtyp randnum = (numtyp)0.0; + + numtyp randnum = (numtyp)0.0; saru(tag1, tag2, seed, timestep, randnum); // conservative force = a0 * wd, or 0 if tstat only @@ -244,7 +244,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, force -= coeff[mtype].y*wd*wd*dot*rinv; force += coeff[mtype].z*wd*randnum*dtinvsqrt; force*=factor_dpd*rinv; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; @@ -254,7 +254,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; - energy+=factor_dpd*e; + energy+=factor_dpd*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -272,23 +272,23 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dpd_fast(const __global numtyp4 *restrict x_, +__kernel void k_dpd_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const 
__global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp4 *restrict v_, const __global numtyp *restrict cutsq, - const numtyp dtinvsqrt, const int seed, + const numtyp dtinvsqrt, const int seed, const int timestep, const int tstat_only, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -296,7 +296,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, if (tid tag2) { tag1 = jtag; tag2 = itag; } - - numtyp randnum = (numtyp)0.0; + + numtyp randnum = (numtyp)0.0; saru(tag1, tag2, seed, timestep, randnum); // conservative force = a0 * wd, or 0 if tstat only @@ -364,7 +364,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, force -= coeff[mtype].y*wd*wd*dot*rinv; force += coeff[mtype].z*wd*randnum*dtinvsqrt; force*=factor_dpd*rinv; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; @@ -374,7 +374,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; - energy+=factor_dpd*e; + energy+=factor_dpd*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h index 449d7b1d8c..42ef854522 100644 --- a/lib/gpu/lal_dpd.h +++ b/lib/gpu/lal_dpd.h @@ -24,23 +24,23 @@ template class DPD : public BaseDPD { public: DPD(); - ~DPD(); + ~DPD(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_a0, + int init(const int ntypes, double **host_cutsq, double **host_a0, double **host_gamma, double **host_sigma, double **host_cut, double *host_special_lj, bool tstat_only, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, + const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -52,11 +52,11 @@ class DPD : public BaseDPD { /// Total host memory used by library for pair style double host_memory_usage() const; - + /// Update coeff if needed (tstat only) void update_coeff(int ntypes, double **host_a0, double **host_gamma, double **host_sigma, double **host_cut); - + // --------------------------- TYPE DATA -------------------------- /// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut @@ -70,12 +70,12 @@ class DPD : public BaseDPD { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// Only used for thermostat int _tstat_only; - + private: bool _allocated; void loop(const bool _eflag, const bool _vflag); 
diff --git a/lib/gpu/lal_dpd_ext.cpp b/lib/gpu/lal_dpd_ext.cpp index 327074d087..792f638cd8 100644 --- a/lib/gpu/lal_dpd_ext.cpp +++ b/lib/gpu/lal_dpd_ext.cpp @@ -54,7 +54,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, int init_ok=0; if (world_me==0) - init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, + init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, host_cut, special_lj, tstat_only, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -72,12 +72,12 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, + init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, host_cut, special_lj, tstat_only, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); DPDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -95,25 +95,25 @@ void dpd_gpu_clear() { int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd) { return DPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, + vatom, host_start, ilist, jnum, cpu_time, success, host_v, dtinvsqrt, seed, timestep, boxlo, prd); -} +} void dpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, tagint *tag, - double **host_v, const double dtinvsqrt, - const int seed, const int timestep, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { DPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index c856a8e667..b83972f4db 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -9,10 +9,10 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ - + #if defined(USE_OPENCL) #include "eam_cl.h" #elif defined(USE_CUDART) @@ -33,7 +33,7 @@ using namespace LAMMPS_AL; extern Device device; template -EAMT::EAM() : BaseAtomic(), +EAMT::EAM() : BaseAtomic(), _compiled_energy(false), _allocated(false) { } @@ -41,46 +41,46 @@ template EAMT::~EAM() { clear(); } - + template int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double ***host_frho_spline, double rdr, double rdrho, + double 
***host_frho_spline, double rdr, double rdrho, double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,eam,"k_eam"); - + if (success!=0) return success; - + // allocate fp - + int ef_nall=nall; if (ef_nall==0) ef_nall=2000; _max_fp_size=static_cast(static_cast(ef_nall)*1.10); _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - + k_energy.set_function(*(this->pair_program),"k_energy"); k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(_fp,1); _compiled_energy = true; - + // Initialize timers for selected GPU time_pair2.init(*(this->ucl_device)); time_pair2.zero(); - + time_fp1.init(*(this->ucl_device)); time_fp1.zero(); - + time_fp2.init(*(this->ucl_device)); time_fp2.zero(); @@ -93,7 +93,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, lj_types=max_shared_types; shared_types=true; } - + _ntypes=lj_types; _cutforcesq=host_cutforcesq; _rdr=rdr; @@ -104,26 +104,26 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, _nz2r=nz2r; _nfrho=nfrho; _nr=nr; - + UCL_H_Vec dview_type(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); - + for (int i=0; i dview_type2frho(lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -136,7 +136,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, // pack frho_spline UCL_H_Vec dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ix dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ix dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline1,dview_z2r_spline,false); z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex"); z2r_spline1_tex.bind_float(z2r_spline1,4); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline2,dview_z2r_spline,false); z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex"); @@ -241,7 +241,7 @@ void EAMT::clear() { if (!_allocated) return; _allocated=false; - + type2rhor_z2r.clear(); type2frho.clear(); rhor_spline1.clear(); @@ -250,13 +250,13 @@ void EAMT::clear() { frho_spline2.clear(); z2r_spline1.clear(); z2r_spline2.clear(); - + _fp.clear(); - + time_pair2.clear(); time_fp1.clear(); time_fp2.clear(); - + if (_compiled_energy) { k_energy_fast.clear(); k_energy.clear(); @@ -283,20 +283,20 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); - + if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); - + // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); - + // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } - + // ------------------- Resize FP Array for EAM -------------------- - + if (nall>_max_fp_size) { _max_fp_size=static_cast(static_cast(nall)*1.10); 
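// grow with ~10% headroom over nall so this reallocation of the per-atom
// fp array (synced host<->device between the two EAM passes, cf. the
// time_fp1/time_fp2 transfer timers above) stays rare as nall fluctuates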
_fp.resize(_max_fp_size); @@ -313,7 +313,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, this->zero_timers(); return; } - + int ago=this->hd_balancer.ago_first(f_ago); int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); this->ans->inum(inum); @@ -326,7 +326,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, if (!success) return; } - + this->atom->cast_x_data(host_x,host_type); this->atom->add_x_data(host_x,host_type); @@ -345,36 +345,36 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, // --------------------------------------------------------------------------- template int** EAMT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, + double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, int &inum, + const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); - + if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); - + // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); - + // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } // ------------------- Resize FP Array for EAM -------------------- - + if (nall>_max_fp_size) { _max_fp_size=static_cast(static_cast(nall)*1.10); _fp.resize(_max_fp_size); fp_tex.bind_float(_fp,1); - } - *fp_ptr=_fp.host.begin(); + } + *fp_ptr=_fp.host.begin(); // ----------------------------------------------------------------- - + if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -382,14 +382,14 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, this->zero_timers(); return NULL; } - + // load balance, returning the atom count on the device (inum) this->hd_balancer.balance(cpu_time); inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); host_start=inum; - - // Build neighbor list on GPU if necessary + + // Build neighbor list on GPU if necessary if (ago==0) { this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); @@ -403,14 +403,14 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, *jnum=this->nbor->host_acc.begin(); loop(eflag,vflag); - + // copy fp from device to host for comm _nlocal=inum_full; time_fp1.start(); _fp.update_host(inum_full,true); time_fp1.stop(); time_fp1.sync_stop(); - + return this->nbor->host_jlist.begin()-host_start; } @@ -420,20 +420,20 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, template void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { - if (this->ans->inum()==0) + if (this->ans->inum()==0) return; - + this->hd_balancer.start_timer(); time_fp2.start(); this->add_fp_data(); time_fp2.stop(); - + loop2(eflag,vflag); if (ilist == NULL) this->ans->copy_answers(eflag,vflag,eatom,vatom); else this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); - + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -455,27 +455,27 @@ void EAMT::loop(const bool _eflag, const bool 
_vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); - + if (shared_types) { this->k_energy_fast.set_size(GX,BX); this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1,&frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &rhor_spline2, &frho_spline1,&frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv, &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1, &frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, + &rhor_spline2, &frho_spline1, &frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv,&eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); @@ -501,25 +501,25 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair2.start(); - + if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r, - &rhor_spline1, &z2r_spline1, &z2r_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &rhor_spline1, &z2r_spline1, &z2r_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, + this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index 054b3ca6db..13440b7d45 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ @@ -82,7 +82,7 @@ texture z2r_sp2_tex; engv[ii]=energy; \ } \ } - + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ @@ -188,37 +188,37 @@ texture z2r_sp2_tex; #endif -__kernel void k_energy(const __global numtyp4 *restrict x_, +__kernel void k_energy(const __global numtyp4 *restrict x_, const __global int2 *restrict type2rhor_z2r, - const __global int *restrict type2frho, - const __global numtyp4 *restrict rhor_spline2, + const __global int *restrict type2frho, + const __global numtyp4 *restrict rhor_spline2, const __global numtyp4 *restrict frho_spline1, const __global numtyp4 *restrict frho_spline2, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global numtyp *restrict fp_, - __global acctyp *restrict engv, + __global numtyp *restrict fp_, + 
__global acctyp *restrict engv, const int eflag, const int inum, const int nbor_pitch, - const int ntypes, const numtyp cutforcesq, - const numtyp rdr, const numtyp rdrho, + const int ntypes, const numtyp cutforcesq, + const numtyp rdr, const numtyp rdrho, const numtyp rhomax, const int nrho, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + acctyp rho = (acctyp)0; acctyp energy = (acctyp)0; - + if (ii { public: EAM(); ~EAM(); - + /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,11 +41,11 @@ class EAM : public BaseAtomic { int init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, - double rdrho, double rhomax, int nrhor, int nrho, int nz2r, - int nfrho, int nr, const int nlocal, const int nall, + double rdrho, double rhomax, int nrhor, int nrho, int nz2r, + int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen); - + // Copy charges to device asynchronously inline void add_fp_data() { int nghost=this->atom->nall()-_nlocal; @@ -57,7 +57,7 @@ class EAM : public BaseAtomic { ucl_copy(dev_view,host_view,nghost,true); } } - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -67,7 +67,7 @@ class EAM : public BaseAtomic { /// Total host memory used by library for pair style double host_memory_usage() const; - + /// Pair loop with host neighboring void compute(const int f_ago, const int inum_full, const int, const int nall, double **host_x, int *host_type, int *ilist, int *numj, @@ -75,23 +75,23 @@ class EAM : public BaseAtomic { const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr); - + /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, int &inum, void **fp_ptr); /// Pair loop with host neighboring - void compute2(int *ilist, const bool eflag, const bool vflag, + void compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom); - + // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_energy, k_energy_fast; - + // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; UCL_Texture rhor_spline1_tex, rhor_spline2_tex; @@ -99,37 +99,37 @@ class EAM : public BaseAtomic { UCL_Texture z2r_spline1_tex, z2r_spline2_tex; // --------------------------- DEVICE DATA -------------------------- - + /// Device Timers UCL_Timer time_pair2, time_fp1, time_fp2; - + // --------------------------- TYPE DATA -------------------------- - + UCL_D_Vec type2rhor_z2r; UCL_D_Vec type2frho; - + UCL_D_Vec z2r_spline1, z2r_spline2; UCL_D_Vec 
frho_spline1, frho_spline2; UCL_D_Vec rhor_spline1, rhor_spline2; - + numtyp _cutforcesq,_rdr,_rdrho, _rhomax; - + int _nfrho,_nrhor,_nrho,_nz2r,_nr; - + /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - - /// Number of atom types + + /// Number of atom types int _ntypes; - + int _max_fp_size; - + /// True of energy kernels are compiled bool _compiled_energy; - + /// Per-atom arrays UCL_Vector _fp; - + protected: bool _allocated; int _nlocal; diff --git a/lib/gpu/lal_eam_alloy_ext.cpp b/lib/gpu/lal_eam_alloy_ext.cpp index 282f93afeb..9209ed5c26 100644 --- a/lib/gpu/lal_eam_alloy_ext.cpp +++ b/lib/gpu/lal_eam_alloy_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMALMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, +int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMALMF.clear(); gpu_mode=EAMALMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMALMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMALMF.device->init_message(screen,"eam/alloy",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_alloy_gpu_clear() { int ** eam_alloy_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, 
double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_alloy_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index d56f750e2f..1b5602f808 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_gpu_init(const int ntypes, double host_cutforcesq, +int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); - if (message) + if 
(message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_gpu_clear() { int ** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_eam_fs_ext.cpp b/lib/gpu/lal_eam_fs_ext.cpp index 4992f3ab98..b9e25466aa 100644 --- a/lib/gpu/lal_eam_fs_ext.cpp +++ b/lib/gpu/lal_eam_fs_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMFSMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, +int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMFSMF.clear(); gpu_mode=EAMFSMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMFSMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMFSMF.device->init_message(screen,"eam/fs",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, 
host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_fs_gpu_clear() { int ** eam_fs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_fs_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index b33f087212..51f785b905 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -245,8 +245,8 @@ ucl_inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans) ucl_inline numtyp gpu_det3(const numtyp m[9]) { - numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - - m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + + numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - + m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; return ans; }; @@ -255,7 +255,7 @@ ucl_inline numtyp gpu_det3(const numtyp m[9]) diagonal matrix times a full matrix ------------------------------------------------------------------------- */ -ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], +ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], numtyp ans[9]) { ans[0] = shape.x*m[0]; @@ -421,7 +421,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, t = aug[9]/aug[5]; aug[10]-=t*aug[6]; aug[11]-=t*aug[7]; - + if (aug[10] == (numtyp)0.0) *error_flag=2; @@ -440,11 +440,11 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, quat = [w i j k] ------------------------------------------------------------------------- */ -ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, +ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, numtyp mat[9]) { numtyp4 q; fetch4(q,qi,quat_tex); - + numtyp w2 = q.x*q.x; numtyp i2 = q.y*q.y; numtyp j2 = q.z*q.z; @@ -561,7 +561,7 @@ ucl_inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9]) ------------------------------------------------------------------------- */ ucl_inline void gpu_times_column3(const numtyp m[9], const numtyp v[3], - numtyp ans[3]) + numtyp ans[3]) { ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2]; ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2]; diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 30d864aecc..cac77f5dd3 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ 
b/lib/gpu/lal_ellipsoid_nbor.cu @@ -29,14 +29,14 @@ texture pos_tex; // -- Only unpack neighbors matching the specified inclusive range of forms // -- Only unpack neighbors within cutoff // --------------------------------------------------------------------------- -__kernel void kernel_nbor(const __global numtyp4 *restrict x_, - const __global numtyp2 *restrict cut_form, - const int ntypes, +__kernel void kernel_nbor(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict cut_form, + const int ntypes, __global int *dev_nbor, - const int nbor_pitch, const int start, const int inum, - const __global int *dev_ij, + const int nbor_pitch, const int start, const int inum, + const __global int *dev_ij, const int form_low, const int form_high) { - + // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X+start; @@ -47,11 +47,11 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, nbor+=nbor_pitch; int nbor_end=nbor+fast_mul(numj,nbor_pitch); int packed=ii+nbor_pitch+nbor_pitch; - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul(iw,ntypes); - int newj=0; + int newj=0; for ( ; nbor=form_low && form[mtype]<=form_high) { // Compute r12; numtyp rsq=jx.x-ix.x; diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp index 342ec4ecda..ef1559c5b6 100644 --- a/lib/gpu/lal_gauss.cpp +++ b/lib/gpu/lal_gauss.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,19 +33,19 @@ GaussT::Gauss() : BaseAtomic(), _allocated(false) { } template -GaussT::~Gauss() { +GaussT::~Gauss() { clear(); } - + template int GaussT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int GaussT::init(const int ntypes, - double **host_cutsq, double **host_a, - double **host_b, double **host_offset, +int GaussT::init(const int ntypes, + double **host_cutsq, double **host_a, + double **host_b, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -94,10 +94,10 @@ void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b, host_cutsq,host_offset); } @@ -135,7 +135,7 @@ void GaussT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu index 6accf36a06..98e71ea413 100644 --- a/lib/gpu/lal_gauss.cu +++ b/lib/gpu/lal_gauss.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,14 +24,14 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_gauss(const __global numtyp4 *restrict x_, +__kernel void k_gauss(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1, - 
const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -49,20 +49,20 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -108,18 +108,18 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, +__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1_in, - const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -127,7 +127,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, if (tid0) { - numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h index 1fd58adae5..d023310c6d 100644 --- a/lib/gpu/lal_gauss.h +++ b/lib/gpu/lal_gauss.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Gauss : public BaseAtomic { public: Gauss(); - ~Gauss(); + ~Gauss(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,16 +38,16 @@ class Gauss : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_a, double **host_b, double **host_offset, + double **host_a, double **host_b, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void 
reinit(const int ntypes, double **host_cutsq, double **host_a, double **host_b, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Gauss : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index 7c15a12591..834c03cf64 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -27,9 +27,9 @@ static Gauss GLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **host_b, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, +int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, + double **host_b, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GLMF.clear(); @@ -54,7 +54,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) - init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, + init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -77,7 +77,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,16 +96,16 @@ void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a, int world_me=GLMF.device->world_me(); int gpu_rank=GLMF.device->gpu_rank(); int procs_per_gpu=GLMF.device->procs_per_gpu(); - + if (world_me==0) GLMF.reinit(ntypes, cutsq, host_a, host_b, offset); - + GLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -124,7 +124,7 @@ int ** gauss_gpu_compute_n(const int ago, const int inum_full, return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void gauss_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp index 1d38810ae8..5abef659b6 100644 --- a/lib/gpu/lal_gayberne.cpp +++ b/lib/gpu/lal_gayberne.cpp @@ -37,21 +37,21 @@ GayBerneT::GayBerne() : BaseEllipsoid(), } template -GayBerneT::~GayBerne() { +GayBerneT::~GayBerne() { clear(); } - + template int GayBerneT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom(max_nbors); } template -int GayBerneT::init(const int ntypes, const double gamma, - const double upsilon, const double mu, - double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, +int 
GayBerneT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, @@ -100,11 +100,11 @@ int GayBerneT::init(const int ntypes, const double gamma, dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); - + // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY); - host_write[0]=static_cast(gamma); + host_write[0]=static_cast(gamma); host_write[1]=static_cast(upsilon); host_write[2]=static_cast(mu); host_write[3]=static_cast(host_special_lj[0]); @@ -117,7 +117,7 @@ int GayBerneT::init(const int ntypes, const double gamma, UCL_H_Vec d_view; d_view.view(host_lshape,lshape.numel(),*(this->ucl_device)); ucl_copy(lshape,d_view,false); - + // Copy shape, well, sigma, epsilon, and cutsq onto GPU // - cast if necessary shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -138,7 +138,7 @@ int GayBerneT::init(const int ntypes, const double gamma, } view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); ucl_copy(well,view4,false); - + _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+ @@ -155,7 +155,7 @@ void GayBerneT::clear() { UCL_H_Vec err_flag(1,*(this->ucl_device)); ucl_copy(err_flag,dev_error,false); if (err_flag[0] == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; err_flag.clear(); _allocated=false; @@ -170,7 +170,7 @@ void GayBerneT::clear() { well.clear(); lshape.clear(); gamma_upsilon_mu.clear(); - + this->clear_base(); } @@ -196,7 +196,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -214,12 +214,12 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, - &this->sigma_epsilon, &this->_lj_types, - &this->lshape, &this->nbor->dev_nbor, &stride, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, &ainum, &this->ans->engv, - &this->dev_error, &eflag, &vflag, + &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); @@ -248,12 +248,12 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.start(); this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, - &this->gamma_upsilon_mu, - &this->sigma_epsilon, &this->_lj_types, - &this->lshape, &this->nbor->dev_nbor, - &stride, &this->ans->force, - &this->ans->engv, &this->dev_error, + &this->shape, &this->well, + &this->gamma_upsilon_mu, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, + &stride, &this->ans->force, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, 
&this->_threads_per_atom); this->time_ellipsoid2.stop(); @@ -264,28 +264,28 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->time_ellipsoid.stop(); this->time_nbor2.start(); this->time_nbor2.stop(); this->time_ellipsoid2.start(); this->time_ellipsoid2.stop(); } - + // ------------ LJ --------------- this->time_lj.start(); if (this->_last_ellipseans->inum()) { if (this->_shared_types) { this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, - &this->gamma_upsilon_mu, &stride, + this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, - &this->ans->engv, &this->dev_error, &eflag, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); } else { this->k_lj.set_size(GX,BX); - this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, + this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, &this->_lj_types, &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, @@ -302,10 +302,10 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->gamma_upsilon_mu, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, &ainum, &this->ans->engv, &this->dev_error, diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index 1a7e69eeba..71f29c2742 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -17,62 +17,62 @@ #include "lal_ellipsoid_extra.h" #endif -ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, +ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, numtyp ans[9]) { numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; den = ucl_recip(den); - + ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; - + ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; - + ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; - + ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; - + ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- 
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; - + ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; - + ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; - + ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; - + ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ @@ -82,28 +82,28 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape __kernel void k_gayberne(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict gum, - const __global numtyp2 *restrict sig_eps, - const int ntypes, - const __global numtyp *restrict lshape, - const __global int *dev_nbor, - const int stride, - __global acctyp4 *restrict ans, - const int astride, - __global acctyp *restrict engv, - __global int *restrict err_flag, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict gum, + const __global numtyp2 *restrict sig_eps, + const int ntypes, + const __global numtyp *restrict lshape, + const __global int *dev_nbor, + const int stride, + __global acctyp4 *restrict ans, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; acctyp energy=(acctyp)0; acctyp4 f; @@ -124,7 +124,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp a1[9], b1[9], g1[9]; @@ -159,7 +159,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, numtyp a2[9]; gpu_quat_to_mat_trans(q,j,a2); - + numtyp u_r, dUr[3], tUr[3], eta, teta[3]; { // Compute U_r, dUr, eta, and teta // Compute g12 @@ -173,7 +173,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, } { // Compute U_r and dUr - + // Compute kappa numtyp kappa[3]; gpu_mldivide3(g12,r12,kappa,err_flag); @@ -189,7 +189,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, kappa[2]*=ir; // energy - + // compute u_r and dUr numtyp uslj_rsq; { @@ -203,7 +203,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, kappa[0]*=r; kappa[1]*=r; kappa[2]*=r; - + int mtype=fast_mul(ntypes,itype)+jtype; numtyp sigma = sig_eps[mtype].x; numtyp epsilon = sig_eps[mtype].y; @@ -235,14 +235,14 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, } } } - + // Compute eta { eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; numtyp det_g12 = gpu_det3(g12); eta = ucl_powr(eta/det_g12,gum[1]); } - + // Compute teta numtyp temp[9], tempv[3], tempv2[3]; compute_eta_torque(g12,a1,ishape,temp); @@ 
-255,7 +255,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, teta[0] = tempv2[0]; teta[1] = tempv2[1]; teta[2] = tempv2[2]; - + tempv[0] = temp1*temp[3]; tempv[1] = temp1*temp[4]; tempv[2] = temp1*temp[5]; @@ -272,7 +272,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, teta[1] += tempv2[1]; teta[2] += tempv2[2]; } - + numtyp chi, dchi[3], tchi[3]; { // Compute chi and dchi @@ -355,7 +355,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - + } // for nbor store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, vflag,ans,engv); diff --git a/lib/gpu/lal_gayberne.h b/lib/gpu/lal_gayberne.h index dacaf74282..8792f1f1db 100644 --- a/lib/gpu/lal_gayberne.h +++ b/lib/gpu/lal_gayberne.h @@ -25,14 +25,14 @@ template class GayBerne : public BaseEllipsoid { public: GayBerne(); - ~GayBerne(); + ~GayBerne(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device + * \param gpu_split fraction of particles handled by device * \return false if there is not sufficient memory or device init prob - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,18 +41,18 @@ class GayBerne : public BaseEllipsoid { * - -5 Double precision is not supported on card **/ int init(const int ntypes, const double gamma, const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, + double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - const double *host_special_lj, const int nlocal, const int nall, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); - + /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; @@ -61,8 +61,8 @@ class GayBerne : public BaseEllipsoid { /// Device Error Flag - Set if a bad matrix inversion occurs UCL_D_Vec dev_error; - - // --------------------------- TYPE DATA -------------------------- + + // --------------------------- TYPE DATA -------------------------- /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form UCL_D_Vec lj1; @@ -72,12 +72,12 @@ class GayBerne : public BaseEllipsoid { UCL_D_Vec sigma_epsilon; // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ... 
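// (seven values packed into one read-only device buffer at init; the
// kernels copy entries 3..6 into their __local sp_lj array)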
UCL_D_Vec gamma_upsilon_mu; - + /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; int _lj_types; - - // --------------------------- ATOM DATA -------------------------- + + // --------------------------- ATOM DATA -------------------------- /// Aspherical Const Data for Atoms UCL_D_Vec shape, well; diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp index e674fb376b..451550e7ef 100644 --- a/lib/gpu/lal_gayberne_ext.cpp +++ b/lib/gpu/lal_gayberne_ext.cpp @@ -33,7 +33,7 @@ int gb_gpu_init(const int ntypes, const double gamma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); @@ -58,16 +58,16 @@ int gb_gpu_init(const int ntypes, const double gamma, int init_ok=0; if (world_me==0) - init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -105,8 +105,8 @@ void gb_gpu_clear() { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); @@ -117,8 +117,8 @@ int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { - return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, - tag, nspecial, special, eflag, vflag, eatom, vatom, + return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index 9b33b5f7f3..7925b72784 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -18,30 +18,30 @@ #endif __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict q, + const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict gum, + const __global numtyp4 *restrict well, + const __global numtyp *restrict gum, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global numtyp *restrict lshape, - const __global int *dev_nbor, + const __global int *dev_nbor, const int stride, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, - __global int 
*restrict err_flag, + __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; acctyp energy=(acctyp)0; acctyp4 f; @@ -58,16 +58,16 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; - + numtyp oner=shape[itype].x; numtyp one_well=well[itype].x; - + numtyp factor_lj; for ( ; nbor0) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); - energy+=factor_lj*(e-lj3[ii].z); + energy+=factor_lj*(e-lj3[ii].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -332,33 +332,33 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict gum, - const int stride, +__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict gum, + const int stride, const __global int *dev_ij, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, - __global int *restrict err_flag, - const int eflag, const int vflag, - const int start, const int inum, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; - __local numtyp sp_lj[4]; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<4) - sp_lj[tid]=gum[tid+3]; + sp_lj[tid]=gum[tid+3]; if (tid0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; @@ -367,9 +367,9 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index 6c6e145319..2190e40516 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,20 +33,20 @@ LJT::LJ() : BaseAtomic(), _allocated(false) { } template -LJT::~LJ() { +LJT::~LJ() { clear(); } - + template int LJT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int LJT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int LJT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double 
**host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -99,10 +99,10 @@ void LJT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq); this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, @@ -143,7 +143,7 @@ void LJT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -155,12 +155,12 @@ void LJT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 9569cb0fd7..5838ac95cf 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_lj(const __global numtyp4 *restrict x_, +__kernel void k_lj(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -44,19 +44,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -101,19 +101,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - 
__global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -124,7 +124,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -133,7 +133,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h index 63a3e8a6c9..01ce85c8ea 100644 --- a/lib/gpu/lal_lj.h +++ b/lib/gpu/lal_lj.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class LJ : public BaseAtomic { public: LJ(); - ~LJ(); + ~LJ(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,15 +40,15 @@ class LJ : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -71,7 +71,7 @@ class LJ : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp index 70e46b9fe1..b59495c41a 100644 --- a/lib/gpu/lal_lj96.cpp +++ b/lib/gpu/lal_lj96.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -36,7 +36,7 @@ template LJ96T::~LJ96() { clear(); } - + template int LJ96T::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,9 +44,9 @@ int 
LJ96T::bytes_per_atom(const int max_nbors) const { template int LJ96T::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -126,7 +126,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,7 +138,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index b219b8bf0d..3bb7750022 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -26,13 +26,13 @@ texture pos_tex; __kernel void k_lj96(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -109,15 +109,15 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj96_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj96_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -132,30 +132,30 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp 
e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj96.h b/lib/gpu/lal_lj96.h index 7d51e287d3..3fdea5265e 100644 --- a/lib/gpu/lal_lj96.h +++ b/lib/gpu/lal_lj96.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class LJ96 : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class LJ96 : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -66,7 +66,7 @@ class LJ96 : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index 14c32ef95e..c7ec9f4448 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ int** lj96_gpu_compute_n(const int ago, const int inum_full, return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void lj96_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp index ef59843c4a..0109446b95 100644 --- a/lib/gpu/lal_lj_class2_long.cpp +++ b/lib/gpu/lal_lj_class2_long.cpp @@ -38,7 +38,7 @@ template LJClass2LongT::~LJClass2Long() { clear(); } - + template int LJClass2LongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -46,8 +46,8 @@ int LJClass2LongT::bytes_per_atom(const int max_nbors) const { template int LJClass2LongT::init(const int ntypes, double **host_cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -136,7 +136,7 @@ void LJClass2LongT::loop(const bool 
_eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -145,11 +145,11 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &_cut_coulsq, &_qqrd2e, &_g_ewald, + &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu index e16de3a327..41ceca35d7 100644 --- a/lib/gpu/lal_lj_class2_long.cu +++ b/lib/gpu/lal_lj_class2_long.cu @@ -32,15 +32,15 @@ texture q_tex; __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj3, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -63,14 +63,14 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -147,20 +147,20 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, - const numtyp cut_coulsq, + const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, + const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -175,7 +175,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -183,16 +183,16 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class LJClass2Long : 
public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -68,7 +68,7 @@ class LJClass2Long : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp index 4bb3aad7ad..fa3e95f1f2 100644 --- a/lib/gpu/lal_lj_class2_long_ext.cpp +++ b/lib/gpu/lal_lj_class2_long_ext.cpp @@ -82,7 +82,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); C2CLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -99,7 +99,7 @@ void c2cl_gpu_clear() { int** c2cl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -109,7 +109,7 @@ int** c2cl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void c2cl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp index 8030f3cfc2..00a5c108d9 100644 --- a/lib/gpu/lal_lj_coul.cpp +++ b/lib/gpu/lal_lj_coul.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template LJCoulT::~LJCoul() { clear(); } - + template int LJCoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int LJCoulT::bytes_per_atom(const int max_nbors) const { template int LJCoulT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -138,7 +138,7 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,14 +149,14 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu index 364203db22..5c7f0da46f 100644 --- a/lib/gpu/lal_lj_coul.cu +++ b/lib/gpu/lal_lj_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_lj_coul(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -63,14 +63,14 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -140,16 +140,16 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -166,7 +166,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -174,16 +174,16 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** 
\param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class LJCoul : public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class LJCoul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp index 135a4dfd9d..1b230096a4 100644 --- a/lib/gpu/lal_lj_coul_debye.cpp +++ b/lib/gpu/lal_lj_coul_debye.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template LJCoulDebyeT::~LJCoulDebye() { clear(); } - + template int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const { template int LJCoulDebyeT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -98,7 +98,7 @@ int LJCoulDebyeT::init(const int ntypes, _qqrd2e=qqrd2e; _kappa=kappa; - + _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ sp_lj.row_bytes(); @@ -140,7 +140,7 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,9 +157,9 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu index 308504c6c8..91b105b3da 100644 --- a/lib/gpu/lal_lj_coul_debye.cu +++ b/lib/gpu/lal_lj_coul_debye.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif 
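/* For reference while reading the Debye kernels that follow -- a minimal
   sketch assuming the standard coul/debye form from the LAMMPS pair_style
   docs, E(r) = qqrd2e*qi*qj*exp(-kappa*r)/r; the helper names below are
   illustrative and not part of this library: */
#include <math.h>
static double debye_energy(double qqrd2e, double qi, double qj,
                           double kappa, double r) {
  return qqrd2e*qi*qj*exp(-kappa*r)/r;      /* screened Coulomb energy */
}
static double debye_forcecoul(double qqrd2e, double qi, double qj,
                              double kappa, double r) {
  /* equals -dE/dr * r; the kernels multiply by r2inv afterwards for fpair */
  return qqrd2e*qi*qj*exp(-kappa*r)*(kappa + 1.0/r);
}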
-__kernel void k_lj_debye(const __global numtyp4 *restrict x_, +__kernel void k_lj_debye(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const numtyp kappa, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -147,15 +147,15 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const numtyp kappa, const int t_per_atom) { @@ -174,7 +174,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -182,16 +182,16 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class LJCoulDebye : public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class LJCoulDebye : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e,_kappa; diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp index 67f5a0075f..8ec189a764 100644 --- a/lib/gpu/lal_lj_coul_debye_ext.cpp +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - 
begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,7 +33,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e, + double *host_special_coul, const double qqrd2e, const double kappa) { LJCDMF.clear(); gpu_mode=LJCDMF.device->gpu_mode(); @@ -82,7 +82,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e, kappa); LJCDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -99,7 +99,7 @@ void ljcd_gpu_clear() { int** ljcd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -109,7 +109,7 @@ int** ljcd_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljcd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp index 3b5cc09805..297ac7414e 100644 --- a/lib/gpu/lal_lj_coul_ext.cpp +++ b/lib/gpu/lal_lj_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,7 +98,7 @@ void ljc_gpu_clear() { int** ljc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -108,7 +108,7 @@ int** ljc_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljc_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp index 03f32a5fd0..71205af0ea 100644 --- a/lib/gpu/lal_lj_coul_long.cpp +++ b/lib/gpu/lal_lj_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template LJCoulLongT::~LJCoulLong() { clear(); } - + template int 
LJCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int LJCoulLongT::bytes_per_atom(const int max_nbors) const { template int LJCoulLongT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -109,10 +109,10 @@ void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq, host_cut_ljsq); this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, @@ -153,7 +153,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -162,7 +162,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu index e0aa2e8a58..0e25bb2dbc 100644 --- a/lib/gpu/lal_lj_coul_long.cu +++ b/lib/gpu/lal_lj_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -29,17 +29,17 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { @@ -63,14 +63,14 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -145,14 +145,14 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global 
numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_, @@ -171,7 +171,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -179,16 +179,16 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class LJCoulLong : public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -73,7 +73,7 @@ class LJCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp index dc93365f22..95bd369336 100644 --- a/lib/gpu/lal_lj_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -82,7 +82,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,15 +102,15 @@ void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJCLMF.device->world_me(); int gpu_rank=LJCLMF.device->gpu_rank(); int procs_per_gpu=LJCLMF.device->procs_per_gpu(); - + if (world_me==0) - LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_cut_ljsq); LJCLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } @@ -122,7 +122,7 @@ void ljcl_gpu_clear() { int** ljcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -132,7 +132,7 @@ int** ljcl_gpu_compute_n(const int 
ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_coul_msm.cpp b/lib/gpu/lal_lj_coul_msm.cpp index dd045b7970..7559a93b90 100644 --- a/lib/gpu/lal_lj_coul_msm.cpp +++ b/lib/gpu/lal_lj_coul_msm.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template LJCoulMSMT::~LJCoulMSM() { clear(); } - + template int LJCoulMSMT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int LJCoulMSMT::bytes_per_atom(const int max_nbors) const { template int LJCoulMSMT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double **host_gcons, double **host_dgcons, double **host_offset, double *host_special_lj, const int nlocal, @@ -93,11 +93,11 @@ int LJCoulMSMT::init(const int ntypes, ncols = 7; UCL_H_Vec dview_gcons(nrows*ncols,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(gcons,dview_gcons,false); gcons_tex.get_texture(*(this->pair_program),"gcons_tex"); @@ -107,11 +107,11 @@ int LJCoulMSMT::init(const int ntypes, ncols = 6; UCL_H_Vec dview_dgcons(nrows*ncols,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(dgcons,dview_dgcons,false); dgcons_tex.get_texture(*(this->pair_program),"dgcons_tex"); @@ -170,7 +170,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -179,7 +179,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu index 0c7c3cdace..3f73c6f47d 100644 --- a/lib/gpu/lal_lj_coul_msm.cu +++ b/lib/gpu/lal_lj_coul_msm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -80,19 +80,19 @@ ucl_inline numtyp dgamma(const numtyp rho, const int order, return ((numtyp)-1.0/rho/rho); } -__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj3, const __global numtyp *restrict gcons, const __global numtyp *restrict dgcons, const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global 
numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const int order, const int t_per_atom) { @@ -116,20 +116,20 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -199,7 +199,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict gcons, @@ -227,7 +227,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -235,16 +235,16 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class LJCoulMSM : public BaseCharge { double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_gcons, double **host_dgcons, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const int order, const double qqrd2e); @@ -65,14 +65,14 @@ class LJCoulMSM : public BaseCharge { UCL_D_Vec lj3; /// Special LJ values [0-3] and Special Coul values [4-7] UCL_D_Vec sp_lj; - + UCL_D_Vec gcons, dgcons; UCL_Texture gcons_tex, dgcons_tex; - + /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e; diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp index ecf3254cf9..ceff1f7c66 100644 --- a/lib/gpu/lal_lj_coul_msm_ext.cpp +++ b/lib/gpu/lal_lj_coul_msm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -84,7 +84,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, order, qqrd2e); LJCMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -101,7 +101,7 @@ void ljcm_gpu_clear() { int** ljcm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, 
const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -111,7 +111,7 @@ int** ljcm_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljcm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_cubic.cpp b/lib/gpu/lal_lj_cubic.cpp index 25f83166e1..933795a8f6 100644 --- a/lib/gpu/lal_lj_cubic.cpp +++ b/lib/gpu/lal_lj_cubic.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndactrung@gmail.com ***************************************************************************/ @@ -33,21 +33,21 @@ LJCubicT::LJCubic() : BaseAtomic(), _allocated(false) { } template -LJCubicT::~LJCubic() { +LJCubicT::~LJCubic() { clear(); } - + template int LJCubicT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int LJCubicT::init(const int ntypes, +int LJCubicT::init(const int ntypes, double **host_cutsq, double **host_cut_inner_sq, - double **host_cut_inner, double **host_sigma, - double **host_epsilon, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, + double **host_cut_inner, double **host_sigma, + double **host_epsilon, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -132,7 +132,7 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -144,12 +144,12 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj2, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj2, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu index 420689383f..a4b1992f33 100644 --- a/lib/gpu/lal_lj_cubic.cu +++ b/lib/gpu/lal_lj_cubic.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndactrung@gmail.com // ***************************************************************************/ @@ -31,16 +31,16 @@ texture pos_tex; #define _DPHIDS (numtyp)2.6899009 // gradient at s #define _A3 (numtyp)27.93357 // cubic coefficient -__kernel void k_lj_cubic(const __global numtyp4 *restrict x_, +__kernel void k_lj_cubic(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj2, - const __global numtyp2 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj, - 
const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const __global numtyp2 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -52,19 +52,19 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e; - if (rsq <= lj2[mtype].x) + if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); else e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -122,20 +122,20 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj2_in, - const __global numtyp2 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp2 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp2 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; @@ -148,7 +148,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -157,7 +157,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e; - if (rsq <= lj2[mtype].x) + if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); else e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj_cubic.h b/lib/gpu/lal_lj_cubic.h index 0fefc727eb..818fb3581b 100644 --- a/lib/gpu/lal_lj_cubic.h +++ b/lib/gpu/lal_lj_cubic.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndactrung@gmail.com ***************************************************************************/ @@ -24,13 +24,13 @@ template class LJCubic : public BaseAtomic { public: LJCubic(); - ~LJCubic(); + ~LJCubic(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,11 +39,11 @@ class LJCubic : public BaseAtomic { * 
- -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double **host_cut_inner_sq, double **host_cut_inner, double **host_sigma, double **host_epsilon, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class LJCubic : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_cubic_ext.cpp b/lib/gpu/lal_lj_cubic_ext.cpp index 518f706781..a45d02a8ca 100644 --- a/lib/gpu/lal_lj_cubic_ext.cpp +++ b/lib/gpu/lal_lj_cubic_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndactrung@gmail.com ***************************************************************************/ @@ -27,11 +27,11 @@ static LJCubic LJCubicLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, +int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, double **cut_inner, double **sigma, double **epsilon, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double *special_lj, - const int inum, const int nall, const int max_nbors, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double *special_lj, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJCubicLMF.clear(); @@ -81,7 +81,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, cell_size, gpu_split, screen); LJCubicLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -106,7 +106,7 @@ int ** ljcb_gpu_compute_n(const int ago, const int inum_full, return LJCubicLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void ljcb_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp index 1b8fdeabb0..384cf75d1f 100644 --- a/lib/gpu/lal_lj_dsf.cpp +++ b/lib/gpu/lal_lj_dsf.cpp @@ -37,22 +37,22 @@ template LJDSFT::~LJDSF() { clear(); } - + template int LJDSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1, +int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double **host_offset, double *host_special_lj, + double 
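// Illustrative host-side sketch (not the library's API) of the rule behind
// the k_*_fast kernels in this patch: the fast variants cache the per-type
// coefficient tables in __local tiles of MAX_SHARED_TYPES*MAX_SHARED_TYPES
// entries, so they can only be selected when every atom type fits that tile.
// The init() routines also check the block size, omitted here for brevity.
#include <cstdio>
#include <initializer_list>

static const int MAX_SHARED_TYPES = 8;  // typical value from lal_preprocessor.h

static bool use_fast_kernel(int ntypes) {
  return ntypes <= MAX_SHARED_TYPES;    // simplified shared_types condition
}

int main() {
  for (int n : {4, 8, 16})
    std::printf("ntypes=%d -> %s\n", n,
                use_fast_kernel(n) ? "k_pair_fast" : "k_pair");
  return 0;
}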
**host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, - const double e_shift, const double f_shift, + const double e_shift, const double f_shift, const double alpha) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -138,7 +138,7 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,15 +149,15 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu index 5e0cd4aca9..323576fe77 100644 --- a/lib/gpu/lal_lj_dsf.cu +++ b/lib/gpu/lal_lj_dsf.cu @@ -31,20 +31,20 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_lj_dsf(const __global numtyp4 *restrict x_, +__kernel void k_lj_dsf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -66,20 +66,20 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -119,7 +119,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, numtyp erfcd = ucl_exp(-alpha*alpha*rsq); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r); erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd; - forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + + forcecoul = prefactor * (erfcc + 
(numtyp)2.0*alpha/MY_PIS*r*erfcd + rsq*f_shift-factor_coul); } else forcecoul = (numtyp)0.0; @@ -156,19 +156,19 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -183,7 +183,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -191,23 +191,23 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -246,7 +246,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, numtyp erfcd = ucl_exp(-alpha*alpha*rsq); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r); erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd; - forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + + forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + rsq*f_shift-factor_coul); } else forcecoul = (numtyp)0.0; diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h index 5badf543c4..0195898ca4 100644 --- a/lib/gpu/lal_lj_dsf.h +++ b/lib/gpu/lal_lj_dsf.h @@ -30,7 +30,7 @@ class LJDSF : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class LJDSF : public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha); /// Clear all host and device data @@ -69,7 +69,7 @@ class LJDSF : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp index 719a792d7f..f516da6622 100644 --- a/lib/gpu/lal_lj_dsf_ext.cpp +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -34,7 +34,7 @@ int ljd_gpu_init(const int 
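// CPU reference sketch of the damped-shifted-force Coulomb term evaluated in
// k_lj_dsf, using std::erfc in place of the kernel's polynomial fit (the
// EWALD_P / A1..A5 constants). Names follow the kernel; factor_coul is
// assumed to be 1 - special_coul, matching the kernel's convention.
#include <cmath>

double dsf_coul_fpair(double qqrd2e, double qi, double qj, double rsq,
                      double alpha, double f_shift, double factor_coul) {
  const double MY_PIS = 1.77245385090551602729;   // sqrt(pi), as in the kernel
  double r = std::sqrt(rsq);
  double prefactor = qqrd2e * qi * qj / r;
  double erfcd = std::exp(-alpha * alpha * rsq);  // exp(-a^2 r^2) factor
  double erfcc = std::erfc(alpha * r);            // kernel approximates this
  double forcecoul = prefactor * (erfcc + 2.0 * alpha / MY_PIS * r * erfcd
                                  + rsq * f_shift - factor_coul);
  return forcecoul / rsq;  // the kernel applies r2inv when summing forces
}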
ntypes, double **cutsq, double **host_lj1, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, - const double e_shift, const double f_shift, + const double e_shift, const double f_shift, const double alpha) { LJDMF.clear(); gpu_mode=LJDMF.device->gpu_mode(); @@ -85,7 +85,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, f_shift, alpha); LJDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ void ljd_gpu_clear() { int** ljd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -112,7 +112,7 @@ int** ljd_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp index 03526bc095..c6d8a92e96 100644 --- a/lib/gpu/lal_lj_expand.cpp +++ b/lib/gpu/lal_lj_expand.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -36,7 +36,7 @@ template LJExpandT::~LJExpand() { clear(); } - + template int LJExpandT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -97,17 +97,17 @@ void LJExpandT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq, host_shift); - + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, host_offset); } @@ -146,7 +146,7 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -155,15 +155,15 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index 6b79db2323..a951b4107a 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ibains@nvidia.com // ***************************************************************************/ @@ -26,15 +26,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_lj_expand(const __global numtyp4 *restrict x_, +__kernel void k_lj_expand(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -52,20 +52,20 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -113,15 +113,15 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -136,30 +136,30 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(numtyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h index 0d0ae0b2e6..a732a3a686 100644 --- a/lib/gpu/lal_lj_expand.h +++ b/lib/gpu/lal_lj_expand.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -30,7 +30,7 @@ class 
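// Illustrative CPU form of the lj/expand interaction handled by the kernels
// above (assumed standard LAMMPS definition): a plain 12-6 LJ evaluated at
// the shifted separation r - delta, where delta is the per-pair "shift"
// coefficient that reinit() packs alongside the lj1 table.
#include <cmath>

double lj_expand_energy(double r, double eps, double sigma, double delta) {
  double rs = r - delta;                // shifted separation r'
  double s6 = std::pow(sigma / rs, 6);  // (sigma/r')^6
  return 4.0 * eps * (s6 * s6 - s6);    // 4*eps*[(sigma/r')^12 - (sigma/r')^6]
}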
LJExpand : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,15 +40,15 @@ class LJExpand : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -71,7 +71,7 @@ class LJExpand : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 5303149d1f..d6ea4a9200 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -30,7 +30,7 @@ static LJExpand LJEMF; int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double **shift, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJEMF.clear(); @@ -78,7 +78,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,12 +98,12 @@ int lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJEMF.device->world_me(); int gpu_rank=LJEMF.device->gpu_rank(); int procs_per_gpu=LJEMF.device->procs_per_gpu(); - + if (world_me==0) LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, shift); LJEMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -97,11 +97,11 @@ void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJLMF.device->world_me(); int gpu_rank=LJLMF.device->gpu_rank(); int procs_per_gpu=LJLMF.device->procs_per_gpu(); - + if (world_me==0) LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset); LJLMF.device->world_barrier(); - + for (int i=0; i LJGROMACST::~LJGROMACS() { clear(); } - + template int LJGROMACST::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -47,11 +47,11 @@ template int LJGROMACST::init(const int ntypes, double **host_cutsq, double **host_lj1, 
double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -134,7 +134,7 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,16 +146,16 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu index f20d8634a5..93dc3d9456 100644 --- a/lib/gpu/lal_lj_gromacs.cu +++ b/lib/gpu/lal_lj_gromacs.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -35,8 +35,8 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -59,7 +59,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; @@ -83,7 +83,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, if (rsq lj1[mtype].w) { @@ -91,7 +91,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, t = r - lj3[mtype].z; numtyp fswitch = r*t*t*(ljsw[mtype].x + ljsw[mtype].y*t); force_lj += fswitch; - } + } force = factor_lj*force_lj * r2inv; @@ -149,22 +149,22 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, lj3[tid]=lj3_in[tid]; ljsw[tid]=ljsw_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii lj1[mtype].w) { @@ -196,7 +196,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 
*restrict x_, t = r - lj3[mtype].z; numtyp fswitch = r*t*t*(ljsw[mtype].x + ljsw[mtype].y*t); force_lj += fswitch; - } + } force = factor_lj*force_lj * r2inv; diff --git a/lib/gpu/lal_lj_gromacs.h b/lib/gpu/lal_lj_gromacs.h index dc949be4a9..1e0f72dafc 100644 --- a/lib/gpu/lal_lj_gromacs.h +++ b/lib/gpu/lal_lj_gromacs.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class LJGROMACS : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class LJGROMACS : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq); /// Clear all host and device data @@ -71,7 +71,7 @@ class LJGROMACS : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_gromacs_ext.cpp b/lib/gpu/lal_lj_gromacs_ext.cpp index b5eb0038b7..83f0ffc403 100644 --- a/lib/gpu/lal_lj_gromacs_ext.cpp +++ b/lib/gpu/lal_lj_gromacs_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,7 +33,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq) { LJGRMMF.clear(); gpu_mode=LJGRMMF.device->gpu_mode(); @@ -59,7 +59,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (world_me==0) LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); LJGRMMF.device->world_barrier(); @@ -78,11 +78,11 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); 
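// Standalone restatement of the force switching applied in k_lj_gromacs:
// beyond the inner cutoff a cubic polynomial in t = r - r_inner is added so
// the force decays smoothly to zero at the outer cutoff. ljsw1/ljsw2 stand
// for the precomputed coefficients packed into ljsw.x and ljsw.y.
#include <cmath>

double gromacs_switched_force(double rsq, double force_lj, double cut_inner_sq,
                              double r_inner, double ljsw1, double ljsw2) {
  if (rsq > cut_inner_sq) {             // same test as the kernel
    double r = std::sqrt(rsq);
    double t = r - r_inner;
    force_lj += r * t * t * (ljsw1 + ljsw2 * t);  // fswitch term
  }
  return force_lj;
}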
LJGRMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -107,7 +107,7 @@ int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, return LJGRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void ljgrm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_mie.cpp b/lib/gpu/lal_mie.cpp index 2ab7cb8d14..a87771e9bb 100644 --- a/lib/gpu/lal_mie.cpp +++ b/lib/gpu/lal_mie.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,17 +33,17 @@ MieT::Mie() : BaseAtomic(), _allocated(false) { } template -MieT::~Mie() { +MieT::~Mie() { clear(); } - + template int MieT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int MieT::init(const int ntypes, double **host_cutsq, +int MieT::init(const int ntypes, double **host_cutsq, double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, @@ -81,7 +81,7 @@ int MieT::init(const int ntypes, double **host_cutsq, mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4, host_offset,host_cutsq); - + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -126,7 +126,7 @@ void MieT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu index 4d718897eb..33018566eb 100644 --- a/lib/gpu/lal_mie.cu +++ b/lib/gpu/lal_mie.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_mie(const __global numtyp4 *restrict x_, +__kernel void k_mie(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mie1, const __global numtyp4 *restrict mie3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_mie(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii class Mie : public BaseAtomic { public: Mie(); - ~Mie(); + ~Mie(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by 
device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class Mie : public BaseAtomic { double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -67,7 +67,7 @@ class Mie : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp index d7c4187a42..f43cde2650 100644 --- a/lib/gpu/lal_mie_ext.cpp +++ b/lib/gpu/lal_mie_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, cell_size, gpu_split, screen); MLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -106,7 +106,7 @@ int ** mie_gpu_compute_n(const int ago, const int inum_full, return MLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void mie_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp index ddf7d843e6..cbdf928863 100644 --- a/lib/gpu/lal_morse.cpp +++ b/lib/gpu/lal_morse.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,20 +33,20 @@ MorseT::Morse() : BaseAtomic(), _allocated(false) { } template -MorseT::~Morse() { +MorseT::~Morse() { clear(); } - + template int MorseT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int MorseT::init(const int ntypes, - double **host_cutsq, double **host_morse1, - double **host_r0, double **host_alpha, - double **host_d0, double **host_offset, +int MorseT::init(const int ntypes, + double **host_cutsq, double **host_morse1, + double **host_r0, double **host_alpha, + double **host_d0, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -125,7 +125,7 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -135,14 +135,14 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + 
&vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu index 2015c71cb2..0a14071d19 100644 --- a/lib/gpu/lal_morse.cu +++ b/lib/gpu/lal_morse.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -26,13 +26,13 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_morse(const __global numtyp4 *restrict x_, +__kernel void k_morse(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mor1, - const __global numtyp2 *restrict mor2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp2 *restrict mor2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, @@ -59,13 +59,13 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; for ( ; nbor0) { numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y; - energy+=e*factor_lj; + energy+=e*factor_lj; } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,15 +111,15 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_morse_fast(const __global numtyp4 *restrict x_, +__kernel void k_morse_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mor1_in, - const __global numtyp2 *restrict mor2_in, + const __global numtyp2 *restrict mor2_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -134,30 +134,30 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_, if (eflag>0) mor2[tid]=mor2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y; - energy+=e*factor_lj; + energy+=e*factor_lj; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_morse.h b/lib/gpu/lal_morse.h index e64852f315..ef80fb4235 100644 --- a/lib/gpu/lal_morse.h +++ b/lib/gpu/lal_morse.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : 
brownw@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Morse : public BaseAtomic { public: Morse(); - ~Morse(); + ~Morse(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class Morse : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_morse1, double **host_r0, double **host_alpha, double **host_d0, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -66,7 +66,7 @@ class Morse : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _types; private: diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp index 3994473fd3..d07a83cd34 100644 --- a/lib/gpu/lal_morse_ext.cpp +++ b/lib/gpu/lal_morse_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Morse MORMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int mor_gpu_init(const int ntypes, double **cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { MORMF.clear(); @@ -55,7 +55,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,7 +78,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, cell_size, gpu_split, screen); MORMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -103,7 +103,7 @@ int** mor_gpu_compute_n(const int ago, const int inum_full, return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void mor_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu index 384b88d9de..d005eb9f97 100644 --- a/lib/gpu/lal_neighbor_cpu.cu +++ b/lib/gpu/lal_neighbor_cpu.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : 
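// CPU sketch of the Morse evaluation mirrored by k_morse: with
// dexp = exp(-alpha*(r - r0)), the pair energy is d0*(dexp^2 - 2*dexp) and
// the force divided by r is 2*d0*alpha*(dexp^2 - dexp)/r. The kernels fold
// these constants into the packed mor1/mor2 coefficient vectors.
#include <cmath>

void morse_pair(double r, double d0, double alpha, double r0,
                double &energy, double &fpair) {
  double dexp = std::exp(-alpha * (r - r0));
  energy = d0 * (dexp * dexp - 2.0 * dexp);        // matches the eflag branch
  fpair = 2.0 * d0 * alpha * (dexp * dexp - dexp) / r;
}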
brownw@ornl.gov // ***************************************************************************/ @@ -17,7 +17,7 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_unpack(__global int *dev_nbor, +__kernel void kernel_unpack(__global int *dev_nbor, const __global int *dev_ij, const int inum, const int t_per_atom) { int tid=THREAD_ID_X; @@ -33,7 +33,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, list+=offset; nbor+=fast_mul(ii,t_per_atom-1)+offset; int stride=fast_mul(t_per_atom,inum); - + for ( ; list pos_tex; texture pos_tex; #endif -__kernel void calc_cell_id(const numtyp4 *restrict pos, - unsigned *restrict cell_id, +__kernel void calc_cell_id(const numtyp4 *restrict pos, + unsigned *restrict cell_id, int *restrict particle_id, - numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, - numtyp i_cell_size, int ncellx, int ncelly, - int ncellz, int inum, int nall, + numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, + numtyp i_cell_size, int ncellx, int ncelly, + int ncellz, int inum, int nall, int cells_in_cutoff) { int i = threadIdx.x + blockIdx.x*blockDim.x; @@ -48,11 +48,11 @@ __kernel void calc_cell_id(const numtyp4 *restrict pos, p.x -= boxlo0; p.y -= boxlo1; p.z -= boxlo2; - + int ix = int(p.x*i_cell_size+cells_in_cutoff); int iy = int(p.y*i_cell_size+cells_in_cutoff); int iz = int(p.z*i_cell_size+cells_in_cutoff); - + int offset_lo, offset_hi; if (i 0 && idx < nall) { int id_l = cell_id[idx-1]; if (id != id_l) { - for (int i = id_l+1; i <= id; i++) + for (int i = id_l+1; i <= id; i++) cell_counts[i] = idx; } } @@ -114,8 +114,8 @@ __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, #endif #endif -__kernel void transpose(__global tagint *restrict out, - const __global tagint *restrict in, +__kernel void transpose(__global tagint *restrict out, + const __global tagint *restrict in, int columns_in, int rows_in) { __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; @@ -138,12 +138,12 @@ __kernel void transpose(__global tagint *restrict out, out[j*rows_in+i] = block[ti][tj]; } -__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, - const __global int *restrict cell_particle_id, - const __global int *restrict cell_counts, +__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, + const __global int *restrict cell_particle_id, + const __global int *restrict cell_counts, __global int *nbor_list, - __global int *host_nbor_list, - __global int *host_numj, + __global int *host_nbor_list, + __global int *host_numj, int neigh_bin_size, numtyp cell_size, int ncellx, int ncelly, int ncellz, int inum, int nt, int nall, int t_per_atom, @@ -154,7 +154,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, int iy = BLOCK_ID_Y % (ncelly - cells_in_cutoff*2) + cells_in_cutoff; int iz = BLOCK_ID_Y / (ncelly - cells_in_cutoff*2) + cells_in_cutoff; int bsx = BLOCK_SIZE_X; - + int icell = ix + iy*ncellx + iz*ncellx*ncelly; __local int cell_list_sh[BLOCK_NBOR_BUILD]; @@ -163,7 +163,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, int icell_begin = cell_counts[icell]; int icell_end = cell_counts[icell+1]; - int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff, + int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff, nbory0 = iy-cells_in_cutoff, nbory1 = iy+cells_in_cutoff, nborx0 = ix-cells_in_cutoff, nborx1 = ix+cells_in_cutoff; @@ -174,9 +174,9 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, int i = icell_begin + tid + ii*bsx; int pid_i = nall, pid_j, 
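// Host-side restatement of the binning rule in calc_cell_id above: positions
// are shifted to the subdomain origin, scaled by the inverse cell size, and
// offset by the ghost-cell border before being flattened to a 1D cell index.
// The kernel's clamping of out-of-range ghost atoms is omitted here.
int cell_id_of(double px, double py, double pz,
               double boxlo0, double boxlo1, double boxlo2,
               double i_cell_size, int ncellx, int ncelly,
               int cells_in_cutoff) {
  int ix = static_cast<int>((px - boxlo0) * i_cell_size + cells_in_cutoff);
  int iy = static_cast<int>((py - boxlo1) * i_cell_size + cells_in_cutoff);
  int iz = static_cast<int>((pz - boxlo2) * i_cell_size + cells_in_cutoff);
  return ix + iy * ncellx + iz * ncellx * ncelly;  // flattened cell id
}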
stride; numtyp4 atom_i, atom_j; - int cnt = 0; + int cnt = 0; __global int *neigh_counts, *neigh_list; - + if (i < icell_end) pid_i = cell_particle_id[i]; @@ -194,7 +194,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, neigh_counts=host_numj+pid_i-inum; neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size; } - + // loop through neighbors for (int nborz = nborz0; nborz <= nborz1; nborz++) { @@ -206,13 +206,13 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, int jcell_begin = cell_counts[jcell]; int jcell_end = cell_counts[jcell+1]; int num_atom_cell = jcell_end - jcell_begin; - + // load jcell to shared memory int num_iter = ucl_ceil((numtyp)num_atom_cell/bsx); for (int k = 0; k < num_iter; k++) { int end_idx = min(bsx, num_atom_cell-k*bsx); - + if (tid < end_idx) { pid_j = cell_particle_id[tid+k*bsx+jcell_begin]; cell_list_sh[tid] = pid_j; @@ -222,9 +222,9 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, pos_sh[tid].z = atom_j.z; } __syncthreads(); - + if (pid_i < nt) { - + for (int j = 0; j < end_idx; j++) { int pid_j = cell_list_sh[j]; // gather from shared memory diff.x = atom_i.x - pos_sh[j].x; @@ -253,11 +253,11 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, } // for (i) } -__kernel void kernel_special(__global int *dev_nbor, - __global int *host_nbor_list, - const __global int *host_numj, +__kernel void kernel_special(__global int *dev_nbor, + __global int *host_nbor_list, + const __global int *host_numj, const __global tagint *restrict tag, - const __global int *restrict nspecial, + const __global int *restrict nspecial, const __global tagint *restrict special, int inum, int nt, int max_nbors, int t_per_atom) { int tid=THREAD_ID_X; @@ -268,7 +268,7 @@ __kernel void kernel_special(__global int *dev_nbor, if (iigpu_bytes(); - + _order=order; _order_m_1=order-1; _order2=_order_m_1*_order; @@ -130,7 +130,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, view.view(rho_coeff[0]+n2lo,numel,*ucl_device); ucl_copy(d_rho_coeff,view,true); _max_bytes+=d_rho_coeff.row_bytes(); - + // Allocate storage for grid _npts_x=nxhi_out-nxlo_out+1; _npts_y=nyhi_out-nylo_out+1; @@ -165,10 +165,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, flag=-3; return 0; } - + error_flag.device.zero(); _max_bytes+=1; - + _cpu_idle_time=0.0; return brick.host.begin(); @@ -180,13 +180,13 @@ void PPPMT::clear(const double cpu_time) { return; _allocated=false; _precompute_done=false; - + brick.clear(); vd_brick.clear(); d_brick_counts.clear(); error_flag.clear(); d_brick_atoms.clear(); - + acc_timers(); device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp, *ans,_max_bytes+_max_an_bytes,cpu_time, @@ -216,7 +216,7 @@ void PPPMT::clear(const double cpu_time) { template void PPPMT::_precompute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, + double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { acc_timers(); @@ -224,7 +224,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, zero_timers(); return; } - + ans->inum(nlocal); if (ago==0) { @@ -250,7 +250,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); int ainum=this->ans->inum(); - + // Boxlo adjusted to be upper left brick and shift for even spline order 
double shift=0.0; if (_order % 2) @@ -258,7 +258,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; - + _delxinv=delxinv; _delyinv=delyinv; _delzinv=delzinv; @@ -268,7 +268,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, device->zero(d_brick_counts,d_brick_counts.numel()); k_particle_map.set_size(GX,BX); k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum, - &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, + &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, &error_flag); @@ -299,7 +299,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, template int PPPMT::spread(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, + double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { if (_precompute_done==false) { @@ -309,10 +309,10 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, } device->stop_host_timer(); - + if (!success || nlocal==0) return 0; - + double t=MPI_Wtime(); time_out.sync_stop(); _cpu_idle_time+=MPI_Wtime()-t; @@ -325,10 +325,10 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, error_flag.device.zero(); d_brick_atoms.resize(_atom_stride*_max_brick_atoms); _max_bytes+=d_brick_atoms.row_bytes(); - return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, + return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } - + return error_flag[0]; } @@ -340,18 +340,18 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) { time_in.start(); vd_brick.update_device(true); time_in.stop(); - + time_interp.start(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); int ainum=this->ans->inum(); - + k_interp.set_size(GX,BX); k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff, &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, - &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, + &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, &ans->force); time_interp.stop(); @@ -381,7 +381,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) { #endif pppm_program=new UCL_Program(dev); - + #ifdef USE_OPENCL pppm_program->load_string(pppm,flags.c_str()); #else diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index 99fe655dfd..11703d6d2a 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -48,17 +48,17 @@ texture q_tex; // Number of pencils per block for charge spread #define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE) -__kernel void particle_map(const __global numtyp4 *restrict x_, +__kernel void particle_map(const __global numtyp4 *restrict x_, const __global numtyp *restrict q_, - const grdtyp delvolinv, const int nlocal, - __global int *restrict counts, - __global grdtyp4 *restrict ans, + const grdtyp 
delvolinv, const int nlocal, + __global int *restrict counts, + __global grdtyp4 *restrict ans, const grdtyp b_lo_x, const grdtyp b_lo_y, const grdtyp b_lo_z, const grdtyp delxinv, const grdtyp delyinv, const grdtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, - const int max_atoms, + const int max_atoms, __global int *restrict error) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; @@ -76,7 +76,7 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, grdtyp4 delta; fetch(delta.w,ii,q_tex); delta.w*=delvolinv; - + if (delta.w!=(grdtyp)0.0) { delta.x=(p.x-b_lo_x)*delxinv; nx=delta.x; @@ -85,14 +85,14 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, delta.z=(p.z-b_lo_z)*delzinv; nz=delta.z; - if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || + if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) *error=1; else { delta.x=nx+(grdtyp)0.5-delta.x; delta.y=ny+(grdtyp)0.5-delta.y; delta.z=nz+(grdtyp)0.5-delta.z; - + int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; int old=atom_add(counts+i, 1); if (old>=max_atoms) { @@ -107,9 +107,9 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, /* --------------------------- */ -__kernel void make_rho(const __global int *restrict counts, +__kernel void make_rho(const __global int *restrict counts, const __global grdtyp4 *restrict atoms, - __global grdtyp *restrict brick, + __global grdtyp *restrict brick, const __global grdtyp *restrict _rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int npts_z, const int nlocal_x, @@ -118,15 +118,15 @@ __kernel void make_rho(const __global int *restrict counts, __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; __local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE]; __local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; - + int tid=THREAD_ID_X; if (tid -1; k-=order) { @@ -184,14 +184,14 @@ __kernel void make_rho(const __global int *restrict counts, z_pos+=z_stride; } } - + __syncthreads(); if (fid *device; @@ -142,21 +142,21 @@ class PPPM { UCL_Vector brick; UCL_Vector vd_brick; - + // Count of number of atoms assigned to each grid point UCL_D_Vec d_brick_counts; // Atoms assigned to each grid point UCL_D_Vec d_brick_atoms; - + // Error checking for out of bounds atoms UCL_Vector error_flag; - + // Number of grid points in brick (including ghost) int _npts_x, _npts_y, _npts_z, _npts_yx; - + // Number of local grid points in brick int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride; - + // -------------------------- SPLINE DATA ------------------------- UCL_D_Vec d_rho_coeff; int _order, _nlower, _nupper, _order_m_1, _order2; @@ -180,12 +180,12 @@ class PPPM { int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms; double _max_bytes, _max_an_bytes; double _cpu_idle_time; - - grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; + + grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; double _slab_volfactor; int _nx_pppm, _ny_pppm, _nz_pppm; - + void compile_kernels(UCL_Device &dev); void _precompute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index 6e5a82af5b..7e07d6c87b 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
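// Sketch of the bounds bookkeeping in particle_map above, reduced to one
// dimension: each charge is mapped to its grid cell in the local brick, and
// atoms falling outside the brick raise the error flag that the host checks
// (spread() also retries with a larger d_brick_atoms when a cell overflows).
int map_to_brick_x(double px, double b_lo_x, double delxinv,
                   int nlocal_x, int *error) {
  double dx = (px - b_lo_x) * delxinv;         // fractional grid coordinate
  int nx = static_cast<int>(dx);
  if (dx < 0.0 || nx >= nlocal_x) *error = 1;  // outside the local brick
  return nx;
}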
__________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ static PPPM PPPMD; // --------------------------------------------------------------------------- template grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, - FILE *screen, const int order, const int nxlo_out, + FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, grdtyp **rho_coeff, @@ -82,7 +82,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, split,success); pppm.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -91,7 +91,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, } float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, - const int order, const int nxlo_out, + const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, float **rho_coeff, @@ -102,7 +102,7 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); if (split==false && respa==false) - PPPMF.device->set_single_precompute(&PPPMF); + PPPMF.device->set_single_precompute(&PPPMF); return b; } @@ -133,20 +133,20 @@ void pppm_gpu_forces_f(double **f) { } double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, - const int order, const int nxlo_out, + const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, double **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, - const int nz_pppm, const bool split, + const int nz_pppm, const bool split, const bool respa, int &success) { double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, - split,success); + split,success); if (split==false && respa==false) - PPPMD.device->set_double_precompute(&PPPMD); + PPPMD.device->set_double_precompute(&PPPMD); return b; } diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index 810afb4c88..d5b1b9b6c0 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -49,17 +49,17 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) { out << v.x << " " << v.y; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) { out << v.x << " " << v.y << " " << v.z; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) { out << v.x << " " << v.y; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { out << v.x << " " << v.y << " " << v.z; return out; diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 9dbb3c5944..69a8e61bd4 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -9,16 +9,16 @@ 
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ //************************************************************************* // Preprocessor Definitions -// +// // Note: It is assumed that constants with the same names are defined with // the same values in all files. -// +// // ARCH // Definition: Architecture number for accelerator // MEM_THREADS @@ -35,22 +35,22 @@ // Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE // PPPM_MAX_SPLINE // Definition: Maximum order for splines in PPPM -// PPPM_BLOCK_1D +// PPPM_BLOCK_1D // Definition: Thread block size for PPPM kernels // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE -// PPPM_BLOCK_1D%32==0 +// PPPM_BLOCK_1D%32==0 // BLOCK_PAIR // Definition: Default thread block size for pair styles // Restrictions: // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR -// BLOCK_CELL_2D +// BLOCK_CELL_2D // Definition: Default block size in each dimension for cell list builds // and matrix transpose -// BLOCK_CELL_ID +// BLOCK_CELL_ID // Definition: Default block size for binning atoms in cell list builds -// BLOCK_NBOR_BUILD +// BLOCK_NBOR_BUILD // Definition: Default block size for neighbor list builds // BLOCK_BIO_PAIR // Definition: Default thread block size for "bio" pair styles @@ -78,10 +78,10 @@ #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define __global +#define __global #define restrict __restrict__ #define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ +#define ucl_inline static __inline__ __device__ #ifdef __CUDA_ARCH__ #define ARCH __CUDA_ARCH__ diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index cbf50fab7d..55034aaf03 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -37,18 +37,18 @@ RESquaredT::RESquared() : BaseEllipsoid(), } template -RESquaredT::~RESquared() { +RESquaredT::~RESquared() { clear(); } - + template int RESquaredT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom(max_nbors); } template -int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, +int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, double **host_epsilon, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, @@ -97,7 +97,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); - + // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -127,7 +127,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, } view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); ucl_copy(well,view4,false); - + _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+ @@ -144,7 +144,7 @@ void RESquaredT::clear() { UCL_H_Vec
err_flag(1,*(this->ucl_device)); ucl_copy(err_flag,dev_error,false); if (err_flag[0] == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; err_flag.clear(); _allocated=false; @@ -158,7 +158,7 @@ void RESquaredT::clear() { shape.clear(); well.clear(); special_lj.clear(); - + this->clear_base(); } @@ -184,7 +184,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -204,10 +204,10 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->k_ellipsoid.set_size(GX,BX); this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, - &this->nbor->dev_nbor, &stride, + &this->sigma_epsilon, &this->_lj_types, + &this->nbor->dev_nbor, &stride, &this->ans->force,&ainum, &this->ans->engv, - &this->dev_error, &eflag, &vflag, + &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); @@ -219,12 +219,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.start(); this->k_ellipsoid_sphere.set_size(GX,BX); - this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, + this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, + &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force,&ainum, - &this->ans->engv, &this->dev_error, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid2.stop(); @@ -251,12 +251,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid3.start(); this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->special_lj, + &this->shape, &this->well, &this->special_lj, &this->sigma_epsilon, &this->_lj_types, - &this->nbor->dev_nbor, &stride, + &this->nbor->dev_nbor, &stride, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, &vflag, - &this->_last_ellipse, &ainum, + &this->_last_ellipse, &ainum, &this->_threads_per_atom); this->time_ellipsoid3.stop(); } else { @@ -266,13 +266,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.zero(); - this->time_ellipsoid.zero(); + this->time_ellipsoid.zero(); this->time_nbor2.zero(); this->time_ellipsoid2.zero(); this->time_nbor3.zero(); this->time_ellipsoid3.zero(); } - + // ------------ LJ --------------- this->time_lj.start(); if (this->_last_ellipseans->inum()) { @@ -287,7 +287,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { } else { this->k_lj.set_size(GX,BX); this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, - &this->_lj_types, &this->special_lj, &stride, + &this->_lj_types, &this->special_lj, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); @@ -302,13 +302,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); 
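// [editor's note] The set_size()/run() pairs below follow the launch-geometry
// convention used throughout lib/gpu: t_per_atom threads cooperate on each
// atom, so one block of BX threads covers BX/t_per_atom atoms. A minimal
// sketch of the arithmetic, mirroring the GX computation spelled out in the
// loop() methods of lal_soft.cpp and lal_table.cpp later in this patch
// (BX, taken here from this->block_size(), and _threads_per_atom are the
// library's own names; treat this as a sketch, not part of the patched code):
//
//   const int BX = this->block_size();
//   int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) /
//                                  (BX/this->_threads_per_atom)));
//   this->k_ellipsoid.set_size(GX,BX);  // GX thread blocks of BX threads each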
this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->special_lj, + &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force, - &ainum, &this->ans->engv, &this->dev_error, + &ainum, &this->ans->engv, &this->dev_error, &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index 3a65ce14ce..e238734074 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -34,31 +34,31 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) __kernel void k_resquared(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict splj, - const __global numtyp2 *restrict sig_eps, - const int ntypes, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict splj, + const __global numtyp2 *restrict sig_eps, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - const int astride, + const int astride, __global acctyp *restrict engv, - __global int *restrict err_flag, + __global int *restrict err_flag, const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); acctyp energy=(acctyp)0; acctyp4 f; @@ -79,7 +79,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; @@ -91,14 +91,14 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa numtyp4 ishape; - + ishape=shape[itype]; numtyp4 ishape2; ishape2.x=ishape.x*ishape.x; ishape2.y=ishape.y*ishape.y; ishape2.z=ishape.z*ishape.z; numtyp ilshape = ishape.x*ishape.y*ishape.z; - + { numtyp aTs[9]; // A1'*S1^2 gpu_quat_to_mat_trans(q,i,a1); @@ -148,7 +148,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, numtyp a2[9]; // Rotation matrix (lab->body) numtyp gamma2[9]; // A'*S^2*A numtyp4 jshape; - + jshape=shape[jtype]; numtyp4 jshape2; jshape2.x=jshape.x*jshape.x; @@ -189,7 +189,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2; H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2; dH=gpu_det3(H12); - + numtyp sigma1p2, sigma2p2, lambda, nu; sigma1p2 = sigma1*sigma1; sigma2p2 = sigma2*sigma2; @@ -299,7 +299,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+(numtyp)3.0*sec); dspu = ucl_recip(h12)-hsec+stemp; pbsu = (numtyp)3.0*sigma*hsec; - + numtyp dspr, pbsr; stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ @@ -310,7 +310,7 @@ __kernel void k_resquared(const __global 
numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); dspr = (numtyp)7.0/h12-hsec+stemp; pbsr = b_alpha*sigma*hsec; - + numtyp dH12[9]; numtyp dUa, dUr, deta, dchi, ddH, dh12; numtyp dsigma1, dsigma2; diff --git a/lib/gpu/lal_re_squared.h b/lib/gpu/lal_re_squared.h index c7441ed83e..8dc137d829 100644 --- a/lib/gpu/lal_re_squared.h +++ b/lib/gpu/lal_re_squared.h @@ -25,14 +25,14 @@ template class RESquared : public BaseEllipsoid { public: RESquared(); - ~RESquared(); + ~RESquared(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device + * \param gpu_split fraction of particles handled by device * \return false if there is not sufficient memory or device init prob - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,7 +41,7 @@ class RESquared : public BaseEllipsoid { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_shape, double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, - int **h_form, double **host_lj1, double **host_lj2, + int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -50,7 +50,7 @@ class RESquared : public BaseEllipsoid { /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); - + /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; @@ -59,8 +59,8 @@ class RESquared : public BaseEllipsoid { /// Device Error Flag - Set if a bad matrix inversion occurs UCL_D_Vec dev_error; - - // --------------------------- TYPE DATA -------------------------- + + // --------------------------- TYPE DATA -------------------------- /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form UCL_D_Vec lj1; @@ -70,12 +70,12 @@ class RESquared : public BaseEllipsoid { UCL_D_Vec sigma_epsilon; /// special lj 0-4 UCL_D_Vec special_lj; - + /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; int _lj_types; - - // --------------------------- ATOM DATA -------------------------- + + // --------------------------- ATOM DATA -------------------------- /// Aspherical Const Data for Atoms UCL_D_Vec shape, well; diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp index e1d8fffb8f..b719dfe05f 100644 --- a/lib/gpu/lal_re_squared_ext.cpp +++ b/lib/gpu/lal_re_squared_ext.cpp @@ -28,8 +28,8 @@ static RESquared REMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, - double **sigma, double **epsilon, - int **form, double **host_lj1, double **host_lj2, + double **sigma, double **epsilon, + int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -56,7 +56,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, + init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, 
epsilon, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -64,7 +64,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, REMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ void re_gpu_clear() { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); @@ -114,8 +114,8 @@ int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { - return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, - tag, nspecial, special, eflag, vflag, eatom, vatom, + return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index 4742e5bd8e..d69dae2461 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -129,32 +129,32 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, - const __global numtyp *restrict splj, + const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - const int astride, - __global acctyp *restrict engv, - __global int *restrict err_flag, - const int eflag, const int vflag, - const int inum, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); @@ -177,7 +177,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; @@ -223,7 +223,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; - numtyp aTs[9]; + numtyp aTs[9]; numtyp4 scorrect; numtyp half_sigma=sigma*(numtyp)0.5; scorrect.x = ishape.x+half_sigma; @@ -260,7 
+260,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0; Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; Ua = epsilon*Ua*sigmap3*solv_f_a; - + stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ (numtyp)60.0; @@ -290,7 +290,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); numtyp dspu = ucl_recip(h12)-hsec+stemp; numtyp pbsu = (numtyp)3.0*sigma*hsec; - + stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ @@ -298,7 +298,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); numtyp dspr = (numtyp)7.0/h12-hsec+stemp; numtyp pbsr = b_alpha*sigma*hsec; - + #pragma unroll for (int i=0; i<3; i++) { numtyp u[3]; @@ -334,7 +334,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } } - + // torque on i numtyp fwae[3]; gpu_row_times3(fourw,aTe,fwae); @@ -384,33 +384,33 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict q, + const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); @@ -429,7 +429,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj, n_stride,nbor_end,nbor); - + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; @@ -445,7 +445,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp a[9]; // Rotation matrix (lab->body) numtyp aTe[9]; // A'*E numtyp4 ishape; - + ishape=shape[itype]; gpu_quat_to_mat_trans(q,i,a); gpu_transpose_times_diag3(a,well[itype],aTe); @@ -467,7 +467,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; - numtyp aTs[9]; + numtyp aTs[9]; numtyp4 scorrect; numtyp half_sigma=sigma * (numtyp)0.5; scorrect.x = ishape.x+half_sigma; @@ -477,7 +477,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5; scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5; gpu_transpose_times_diag3(a,scorrect,aTs); - + // energy numtyp gamma[9], s[3]; 
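// [editor's note] Every kernel in lal_re_squared_lj.cu stages the four
// special-neighbor LJ scale factors into local (shared) memory before its
// pair loop; the two variants below are copied from the kernels in this
// file for easy comparison. The plain kernels let every thread redundantly
// store all four values:
//
//   __local numtyp sp_lj[4];
//   sp_lj[0]=splj[0]; sp_lj[1]=splj[1];
//   sp_lj[2]=splj[2]; sp_lj[3]=splj[3];
//
// while the *_fast kernels load cooperatively and synchronize once before
// any thread reads the shared copies:
//
//   if (tid<4)
//     sp_lj[tid]=gum[tid];
//   __syncthreads();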
@@ -505,7 +505,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp ilshape=ishape.x*ishape.y*ishape.z; Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; Ua = epsilon*Ua*sigmap3*solv_f_a; - + stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ (numtyp)60.0; @@ -535,7 +535,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); numtyp dspu = ucl_recip(h12)-hsec+stemp; numtyp pbsu = (numtyp)3.0*sigma*hsec; - + stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ @@ -543,7 +543,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); numtyp dspr = (numtyp)7.0/h12-hsec+stemp; numtyp pbsr = b_alpha*sigma*hsec; - + #pragma unroll for (int i=0; i<3; i++) { numtyp u[3]; @@ -584,15 +584,15 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict gum, - const int stride, - const __global int *dev_ij, +__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict gum, + const int stride, + const __global int *dev_ij, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, const int start, const int inum, const int t_per_atom) { @@ -601,10 +601,10 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=gum[0]; - sp_lj[1]=gum[1]; - sp_lj[2]=gum[2]; - sp_lj[3]=gum[3]; + sp_lj[0]=gum[0]; + sp_lj[1]=gum[1]; + sp_lj[2]=gum[2]; + sp_lj[3]=gum[3]; acctyp energy=(acctyp)0; acctyp4 f; @@ -614,20 +614,20 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); - energy+=factor_lj*(e-lj3[ii].z); + energy+=factor_lj*(e-lj3[ii].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -671,33 +671,33 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict gum, +__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict gum, const int stride, const __global int *dev_ij, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; - __local numtyp sp_lj[4]; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<4) - sp_lj[tid]=gum[tid]; + sp_lj[tid]=gum[tid]; if (tid0) lj3[tid]=lj3_in[tid]; } - + acctyp 
energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; @@ -706,9 +706,9 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp index c206a997a9..337bdd6738 100644 --- a/lib/gpu/lal_soft.cpp +++ b/lib/gpu/lal_soft.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ SoftT::Soft() : BaseAtomic(), _allocated(false) { } template -SoftT::~Soft() { +SoftT::~Soft() { clear(); } - + template int SoftT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -89,14 +89,14 @@ int SoftT::init(const int ntypes, double **host_cutsq, template void SoftT::reinit(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor, host_cut,host_cutsq); } @@ -134,7 +134,7 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu index b7c32b6879..831b986725 100644 --- a/lib/gpu/lal_soft.cu +++ b/lib/gpu/lal_soft.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -26,7 +26,7 @@ texture pos_tex; #define MY_PI (acctyp)3.14159265358979323846 -__kernel void k_soft(const __global numtyp4 *restrict x_, +__kernel void k_soft(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, const int lj_types, const __global numtyp *restrict sp_lj_in, @@ -51,20 +51,20 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0) force = factor_lj * coeff[mtype].x * sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r); else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,7 +111,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_soft_fast(const __global numtyp4 *restrict x_, +__kernel void k_soft_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, @@ -122,7 +122,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 
coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -130,7 +130,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, if (tid (numtyp)0.0) force = factor_lj * coeff[mtype].x * sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r); else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h index 7fa529c4f5..e72673248c 100644 --- a/lib/gpu/lal_soft.h +++ b/lib/gpu/lal_soft.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Soft : public BaseAtomic { public: Soft(); - ~Soft(); + ~Soft(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successful * - -1 if fix gpu not found @@ -40,14 +40,14 @@ class Soft : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Soft : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 9591923965..441fe35839 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,16 +96,16 @@ void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor, int world_me=SLMF.device->world_me(); int gpu_rank=SLMF.device->gpu_rank(); int procs_per_gpu=SLMF.device->procs_per_gpu(); - + if (world_me==0) SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut); - + SLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -124,7 +124,7 @@ int ** soft_gpu_compute_n(const int ago, const int inum_full, return SLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void
soft_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index e2d1b5e4dd..8cb51307a1 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -27,14 +27,14 @@ static SW SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, +int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen, int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* sw_epsilon, const double* sw_sigma, const double* sw_lambda, const double* sw_gamma, const double* sw_costheta, const double* sw_biga, const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, + const double* sw_powerq, const double* sw_cut, const double* sw_cutsq) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); @@ -46,7 +46,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ int procs_per_gpu=SWMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; SWMF.device->init_message(screen,"sw/gpu",first_gpu,last_gpu); @@ -64,7 +64,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ if (world_me==0) init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, + sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->world_barrier(); @@ -83,12 +83,12 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ if (gpu_rank==i && world_me!=0) init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, + sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, + sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -113,12 +113,12 @@ int ** sw_gpu_compute_n(const int ago, const int inum_full, return SWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} -void sw_gpu_compute(const int ago, const int nlocal, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, +void sw_gpu_compute(const int ago, const int nlocal, const int nall, + const int nlist, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { SWMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp index 
c99bf85815..0de59c84b2 100644 --- a/lib/gpu/lal_table.cpp +++ b/lib/gpu/lal_table.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -34,35 +34,35 @@ using namespace LAMMPS_AL; extern Device device; template -TableT::Table() : BaseAtomic(), +TableT::Table() : BaseAtomic(), _allocated(false), _compiled_styles(false) { } template -TableT::~Table() { +TableT::~Table() { clear(); } - + template int TableT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int TableT::init(const int ntypes, +int TableT::init(const int ntypes, double **host_cutsq, double ***host_table_coeffs, double **host_table_data, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, + const double gpu_split, FILE *_screen, int tabstyle, int ntables, int tablength) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,table,"k_table"); if (success!=0) return success; - + k_pair_linear.set_function(*(this->pair_program),"k_table_linear"); k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast"); k_pair_spline.set_function(*(this->pair_program),"k_table_spline"); @@ -80,38 +80,38 @@ int TableT::init(const int ntypes, shared_types=true; } _lj_types=lj_types; - + _tabstyle = tabstyle; _ntables = ntables; if (tabstyle != BITMAP) _tablength = tablength; else _tablength = 1 << tablength; - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write_int(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; iucl_device),UCL_READ_ONLY); nshiftbits.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); nmask.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int ix=1; ix host_write(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -151,7 +151,7 @@ int TableT::init(const int ntypes, host_write2[n*_tablength+k].z = host_table_data[n][6*k+2]; // f host_write2[n*_tablength+k].w = (numtyp)0; } - } + } } ucl_copy(coeff3,host_write2,false); @@ -166,21 +166,21 @@ int TableT::init(const int ntypes, for (int n=0; n<_ntables; n++) { if (tabstyle == LINEAR) { for (int k=0; k<_tablength-1; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // de host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // df host_write2[n*_tablength+k].w = (numtyp)0; } } else if (tabstyle == SPLINE) { for (int k=0; k<_tablength; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // e2 host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // f2 host_write2[n*_tablength+k].w = (numtyp)0; } } else if (tabstyle == BITMAP) { for (int k=0; k<_tablength; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // de host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // df host_write2[n*_tablength+k].w = host_table_data[n][6*k+5]; // drsq @@ -188,12 +188,12 @@ int TableT::init(const int ntypes, } } 
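// [editor's note] For orientation while reading the packing loops above:
// each table knot becomes one numtyp4, with per-table constants in coeff2
// (x=innersq, y=invdelta, z=deltasq6, per the comments in lal_table.h) and
// per-knot data in coeff3 (rsq/e/f) and coeff4 (de/df, or e2/f2 for SPLINE).
// A sketch of how a LINEAR lookup consumes them on the device; idx, the knot
// index within this pair type's slice of the table, is computed as in
// k_table_linear in lal_table.cu:
//
//   int itable = (rsq - coeff2[mtype].x)*coeff2[mtype].y; // (rsq-innersq)*invdelta
//   numtyp fraction = (rsq - coeff3[idx].x)*coeff2[mtype].y;
//   numtyp value = coeff3[idx].z + fraction*coeff4[idx].z; // f + fraction*df
//   numtyp force = factor_lj*value;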
ucl_copy(coeff4,host_write2,false); - + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); - + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -220,7 +220,7 @@ void TableT::clear() { coeff3.clear(); coeff4.clear(); sp_lj.clear(); - + if (_compiled_styles) { k_pair_linear_fast.clear(); k_pair_linear.clear(); @@ -230,7 +230,7 @@ void TableT::clear() { k_pair_bitmap.clear(); _compiled_styles=false; } - + this->clear_atomic(); } @@ -256,7 +256,7 @@ void TableT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -269,67 +269,67 @@ void TableT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, &ainum, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { this->k_pair_linear_fast.set_size(GX,BX); - this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, + this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline_fast.set_size(GX,BX); - this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, + this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap_fast.set_size(GX,BX); this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits, &nmask, &coeff2, &coeff3, &coeff4, &cutsq, - &sp_lj, &this->nbor->dev_nbor, + &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, + &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); - } + } } else { if (_tabstyle == LOOKUP) { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, + this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { this->k_pair_linear.set_size(GX,BX); this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, + &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline.set_size(GX,BX); 
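// [editor's note] The SPLINE kernels launched here evaluate the standard
// natural cubic-spline interpolant between knots i and i+1, with b = 1 - a,
// deltasq6 = delta^2/6 precomputed in coeff2.z, and e2/f2 the
// second-derivative tables packed into coeff4 above:
//
//   e = a*e_i + b*e_{i+1}
//       + ((a*a*a - a)*e2_i + (b*b*b - b)*e2_{i+1})*deltasq6;
//
// which matches the expression visible in k_table_spline in lal_table.cu.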
this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &coeff4, &_lj_types, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap.set_size(GX,BX); - this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, + this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, &nmask, &coeff2, &coeff3, &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, + &nbor_pitch, &this->_threads_per_atom, &_tablength); } } diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 1033b7fbb8..971b56d96e 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -39,39 +39,39 @@ typedef union { /// ---------------- LOOKUP ------------------------------------------------- -__kernel void k_table(const __global numtyp4 *restrict x_, +__kernel void k_table(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } @@ -136,21 +136,21 @@ __kernel void k_table(const __global numtyp4 *restrict x_, __kernel void k_table_fast(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const __global numtyp *restrict cutsq_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global 
numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -158,18 +158,18 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } @@ -235,24 +235,24 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, /// ---------------- LINEAR ------------------------------------------------- -__kernel void k_table_linear(const __global numtyp4 *restrict x_, +__kernel void k_table_linear(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -265,9 +265,9 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -334,23 +334,23 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, +__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const __global numtyp *restrict cutsq_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -358,7 +358,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e = (numtyp)0.0; - 
if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -439,39 +439,39 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, /// ---------------- SPLINE ------------------------------------------------- -__kernel void k_table_spline(const __global numtyp4 *restrict x_, +__kernel void k_table_spline(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; if (itable < tlm1) { - e = a * coeff3[idx].y + b * coeff3[idx+1].y + - ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * + e = a * coeff3[idx].y + b * coeff3[idx+1].y + + ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * coeff2[mtype].z; - } + } energy+=factor_lj*e; } if (vflag>0) { @@ -545,23 +545,23 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_table_spline_fast(const __global numtyp4 *x_, +__kernel void k_table_spline_fast(const __global numtyp4 *x_, const __global int *tabindex, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const __global numtyp *cutsq_in, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -569,7 +569,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, if (tid0) { numtyp e = (numtyp)0.0; if (itable < tlm1) { - e = a * coeff3[idx].y + b * coeff3[idx+1].y + - ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * + e = a * coeff3[idx].y + b * coeff3[idx+1].y + + ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * coeff2[mtype].z; - } + } energy+=factor_lj*e; } if (vflag>0) { @@ -657,41 +657,41 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, /// ---------------- BITMAP 
------------------------------------------------- -__kernel void k_table_bitmap(const __global numtyp4 *x_, +__kernel void k_table_bitmap(const __global numtyp4 *x_, const __global int *tabindex, - const __global int *nshiftbits, + const __global int *nshiftbits, const __global int *nmask, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const int lj_types, const __global numtyp *cutsq, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii>= nshiftbits[mtype]; @@ -734,14 +734,14 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, value = coeff3[idx].z + fraction*coeff4[idx].z; force = factor_lj * value; } else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e = (numtyp)0.0; - if (itable <= tlm1) + if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -761,25 +761,25 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, } // if ii } -__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, +__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, const __global int *tabindex, - const __global int *nshiftbits, + const __global int *nshiftbits, const __global int *nmask, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const __global numtyp *cutsq_in, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -787,18 +787,18 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, if (tid>= nshiftbits[mtype]; @@ -842,14 +842,14 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, value = coeff3[idx].z + fraction*coeff4[idx].z; force = factor_lj * value; } else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e = (numtyp)0.0; - if (itable <= tlm1) + if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } diff --git a/lib/gpu/lal_table.h b/lib/gpu/lal_table.h index 0e04737d27..f667336679 100644 --- a/lib/gpu/lal_table.h +++ 
b/lib/gpu/lal_table.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Table : public BaseAtomic { public: Table(); - ~Table(); + ~Table(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,10 +38,10 @@ class Table : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double** cutsq, double ***host_table_coeffs, - double **host_table_data, + double **host_table_data, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, int tabstyle, int ntables, int tablength); @@ -54,42 +54,42 @@ class Table : public BaseAtomic { /// Total host memory used by library for pair style double host_memory_usage() const; - + // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_pair_linear, k_pair_linear_fast; UCL_Kernel k_pair_spline, k_pair_spline_fast; UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast; - + // --------------------------- TYPE DATA -------------------------- UCL_D_Vec tabindex, nshiftbits, nmask; - - /// coeff2.x = innersq, coeff2.y = invdelta, coeff2.z = deltasq6, + + /// coeff2.x = innersq, coeff2.y = invdelta, coeff2.z = deltasq6, UCL_D_Vec coeff2; - + /// coeff3.x = rsq, coeff3.y = e, coeff3.z = f UCL_D_Vec coeff3; - + /// coeff4.x = de, coeff4.y = df UCL_D_Vec coeff4; - + UCL_D_Vec cutsq; - + /// Special LJ values UCL_D_Vec sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// Table style, length and number of tables int _tabstyle,_tablength,_ntables; - + private: bool _allocated, _compiled_styles; - + void loop(const bool _eflag, const bool _vflag); }; diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index 172acb7d39..4eb7e0ce1b 100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ static Table TBMF; int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, double **table_data, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, + const double cell_size, int &gpu_mode, FILE *screen, int tabstyle, int ntables, int tablength) { TBMF.clear(); gpu_mode=TBMF.device->gpu_mode(); @@ -55,7 +55,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, 
+                      special_lj, inum, nall, 300, maxspecial, cell_size,
                       gpu_split, screen, tabstyle, ntables, tablength);

   TBMF.device->world_barrier();
@@ -73,11 +73,11 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data,
-                        special_lj, inum, nall, 300, maxspecial, cell_size, 
+                        special_lj, inum, nall, 300, maxspecial, cell_size,
                         gpu_split, screen, tabstyle, ntables, tablength);

     TBMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -102,7 +102,7 @@ int ** table_gpu_compute_n(const int ago, const int inum_full,
   return TBMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success);
-} 
+}

 void table_gpu_compute(const int ago, const int inum_full, const int nall,
                        double **host_x, int *host_type, int *ilist, int *numj,
diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp
index 585dc069a0..88cb8cdb3c 100644
--- a/lib/gpu/lal_yukawa.cpp
+++ b/lib/gpu/lal_yukawa.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -33,19 +33,19 @@ YukawaT::Yukawa() : BaseAtomic<numtyp, acctyp>(), _allocated(false) {
 }

 template <class numtyp, class acctyp>
-YukawaT::~Yukawa() { 
+YukawaT::~Yukawa() {
   clear();
 }
- 
+
 template <class numtyp, class acctyp>
 int YukawaT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }

 template <class numtyp, class acctyp>
-int YukawaT::init(const int ntypes, 
+int YukawaT::init(const int ntypes,
                   double **host_cutsq, double kappa,
-                  double **host_a, double **host_offset, 
+                  double **host_a, double **host_offset,
                   double *host_special_lj, const int nlocal,
                   const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
@@ -83,7 +83,7 @@ int YukawaT::init(const int ntypes,
   ucl_copy(sp_lj,dview,false);

   _kappa = kappa;
-  
+
   _allocated=true;
   this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
   return 0;
@@ -122,7 +122,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-  
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
@@ -134,7 +134,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
-                          &vflag, &ainum, &nbor_pitch, 
+                          &vflag, &ainum, &nbor_pitch,
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu
index b0c3b9978d..a8d637ec97 100644
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@@ -9,7 +9,7 @@
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 //  __________________________________________________________________________
 //
-//    begin                : 
+//    begin                :
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/
@@ -24,14 +24,14 @@ texture pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_yukawa(const __global numtyp4 *restrict x_, 
+__kernel void k_yukawa(const __global numtyp4 *restrict x_,
                        const __global numtyp4 *restrict coeff,
                        const numtyp kappa, const int lj_types,
-                       const __global numtyp *restrict sp_lj_in, 
-                       const __global int *dev_nbor, 
-                       const __global int *dev_packed, 
+                       const __global numtyp *restrict sp_lj_in,
+                       const __global int *dev_nbor,
+                       const __global int *dev_packed,
                        __global acctyp4 *restrict ans,
-                       __global acctyp *restrict engv, 
+                       __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -49,20 +49,20 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-  
+
   if (ii0) {
       numtyp e=coeff[mtype].x*screening*rinv;
-      energy+=factor_lj*(e-coeff[mtype].y); 
+      energy+=factor_lj*(e-coeff[mtype].y);
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -109,19 +109,19 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
   } // if ii
 }

-__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, 
+__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict coeff_in,
-                            const numtyp kappa, 
-                            const __global numtyp *restrict sp_lj_in, 
-                            const __global int *dev_nbor, 
-                            const __global int *dev_packed, 
-                            __global acctyp4 *restrict ans, 
-                            __global acctyp *restrict engv, 
-                            const int eflag, const int vflag, const int inum, 
+                            const numtyp kappa,
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv,
+                            const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
-  
+
   __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
   if (tid<4)
@@ -129,7 +129,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
   if (tid0) {
       numtyp e=coeff[mtype].x*screening*rinv;
-      energy+=factor_lj*(e-coeff[mtype].y); 
+      energy+=factor_lj*(e-coeff[mtype].y);
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h
index 720dc903d0..4cc23c03e9 100644
--- a/lib/gpu/lal_yukawa.h
+++ b/lib/gpu/lal_yukawa.h
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
 class Yukawa : public BaseAtomic<numtyp, acctyp> {
  public:
   Yukawa();
-  ~Yukawa(); 
+  ~Yukawa();

   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    * 
+    *
     * Returns:
     * - 0 if successfull
     * - -1 if fix gpu not found
@@ -39,8 +39,8 @@ class Yukawa : public BaseAtomic<numtyp, acctyp> {
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq, double kappa,
            double **host_a, double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);

   /// Clear all host and device data
@@ -57,16 +57,16 @@ class Yukawa : public BaseAtomic<numtyp, acctyp> {

   /// coeff.x = a, coeff.y = offset, coeff.z = cutsq
   UCL_D_Vec<numtyp4> coeff;
-  
+
   /// Special LJ values
   UCL_D_Vec<numtyp> sp_lj;

   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;

-  /// Number of atom types 
+  /// Number of atom types
   int _lj_types;
-  
+
   /// kappa
   numtyp _kappa;
diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp
index 70282a7117..bfe398c62e 100644
--- a/lib/gpu/lal_yukawa_colloid.cpp
+++ b/lib/gpu/lal_yukawa_colloid.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -29,23 +29,23 @@ using namespace LAMMPS_AL;
 extern Device<PRECISION,ACC_PRECISION> device;

 template <class numtyp, class acctyp>
-YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp, acctyp>(), 
+YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp, acctyp>(),
                                   _max_rad_size(0), _allocated(false) {
 }

 template <class numtyp, class acctyp>
-YukawaColloidT::~YukawaColloid() { 
+YukawaColloidT::~YukawaColloid() {
   clear();
 }
- 
+
 template <class numtyp, class acctyp>
 int YukawaColloidT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }

 template <class numtyp, class acctyp>
-int YukawaColloidT::init(const int ntypes, 
-                         double **host_cutsq, double **host_a, 
+int YukawaColloidT::init(const int ntypes,
+                         double **host_cutsq, double **host_a,
                          double **host_offset, double *host_special_lj,
                          const int nlocal, const int nall, const int max_nbors,
                          const int maxspecial, const double cell_size,
@@ -62,16 +62,16 @@ int YukawaColloidT::init(const int ntypes,
     _shared_view=false;

   // allocate rad
-  
+
   int ef_nall=nall;
   if (ef_nall==0)
     ef_nall=2000;
-  
+
   _max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  
+
   if (_shared_view==false)
     c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
-  
+
   rad_tex.get_texture(*(this->pair_program),"rad_tex");
   rad_tex.bind_float(c_rad,1);
@@ -102,7 +102,7 @@ int YukawaColloidT::init(const int ntypes,
   sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
   dview.view(host_special_lj,4,*(this->ucl_device));
   ucl_copy(sp_lj,dview,false);
-  
+
   _allocated=true;
   this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
   return 0;
@@ -131,15 +131,15 @@ double YukawaColloidT::host_memory_usage() const {
 // Copy nbor list from host if necessary and then compute atom energies/forces
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void YukawaColloidT::compute(const int f_ago, const int inum_full, 
-                             const int nall, double **host_x, int *host_type, int *ilist, 
+void YukawaColloidT::compute(const int f_ago, const int inum_full,
+                             const int nall, double **host_x, int *host_type, int *ilist,
                              int *numj, int **firstneigh, const bool eflag,
                              const bool vflag, const bool eatom,
                              const bool vatom, int &host_start,
                              const double cpu_time, bool &success, double *rad) {
   this->acc_timers();
-  
+
   // ------------------- Resize rad array --------------------------
-  
+
   if (nall>_max_rad_size) {
     _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
     if (_shared_view==false) {
@@ -157,7 +157,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full,
     this->zero_timers();
     return;
   }
-  
+
   int ago=this->hd_balancer.ago_first(f_ago);
   int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
   this->ans->inum(inum);
@@ -170,7 +170,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full,
     if (!success)
       return;
   }
-  
+
   this->atom->cast_x_data(host_x,host_type);
   this->cast_rad_data(rad);
   this->hd_balancer.start_timer();
@@ -182,7 +182,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full,
   this->device->add_ans_object(this->ans);
   this->hd_balancer.stop_timer();
 }
-  
+
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU and then compute per-atom densities
 // ---------------------------------------------------------------------------
@@ -190,24 +190,24 @@ template <class numtyp, class acctyp>
 int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall,
                               double **host_x, int *host_type, double *sublo,
                               double *subhi, tagint *tag, int **nspecial,
-                              tagint **special, const bool eflag, const bool vflag, 
-                              const bool eatom, const bool vatom, int &host_start, 
+                              tagint **special, const bool eflag, const bool vflag,
+                              const bool eatom, const bool vatom, int &host_start,
                               int **ilist, int **jnum, const double cpu_time,
                               bool &success, double *rad) {
   this->acc_timers();
-  
+
   // ------------------- Resize rad array ----------------------------
-  
+
   if (nall>_max_rad_size) {
     _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
     if (_shared_view==false) {
       c_rad.resize(_max_rad_size);
       rad_tex.bind_float(c_rad,1);
     }
-  } 
+  }
   // -----------------------------------------------------------------
-  
+
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
@@ -215,21 +215,21 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall
     this->zero_timers();
     return NULL;
   }
-  
+
   // load balance, returning the atom count on the device (inum)
   this->hd_balancer.balance(cpu_time);
   int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
   this->ans->inum(inum);
   host_start=inum;
-  
-  // Build neighbor list on GPU if necessary 
+
+  // Build neighbor list on GPU if necessary
   if (ago==0) {
     this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                           sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     this->cast_rad_data(rad);
-    this->hd_balancer.start_timer(); 
+    this->hd_balancer.start_timer();
   } else {
     this->atom->cast_x_data(host_x,host_type);
     this->cast_rad_data(rad);
@@ -265,7 +265,7 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-  
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
@@ -280,8 +280,8 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) {
                           &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj, 
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+    this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
                      &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
   }
diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu
index f9f4767123..ad02f202a3 100644
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@@ -9,7 +9,7 @@
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 //  __________________________________________________________________________
 //
-//    begin                : 
+//    begin                :
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/
@@ -29,15 +29,15 @@ texture rad_tex;
 #define rad_tex rad_
 #endif

-__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, 
+__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict rad_,
-                               const __global numtyp4 *restrict coeff, 
-                               const int lj_types, 
-                               const __global numtyp *restrict sp_lj_in, 
-                               const __global int *dev_nbor, 
-                               const __global int *dev_packed, 
+                               const __global numtyp4 *restrict coeff,
+                               const int lj_types,
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor,
+                               const __global int *dev_packed,
                                __global acctyp4 *restrict ans,
-                               __global acctyp *restrict engv, 
+                               __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch, const int t_per_atom,
                                const numtyp kappa) {
@@ -56,21 +56,21 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-  
+
   if (ii0) {
       numtyp e=coeff[mtype].x/kappa * screening;
-      energy+=factor_lj*(e-coeff[mtype].y); 
+      energy+=factor_lj*(e-coeff[mtype].y);
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -118,20 +118,20 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
   } // if ii
 }

-__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, 
+__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
                                     const __global numtyp *restrict rad_,
-                                    const __global numtyp4 *restrict coeff_in, 
+                                    const __global numtyp4 *restrict coeff_in,
                                     const __global numtyp *restrict sp_lj_in,
-                                    const __global int *dev_nbor, 
-                                    const __global int *dev_packed, 
-                                    __global acctyp4 *restrict ans, 
-                                    __global acctyp *restrict engv, 
-                                    const int eflag, const int vflag, 
-                                    const int inum, const int nbor_pitch, 
+                                    const __global int *dev_nbor,
+                                    const __global int *dev_packed,
+                                    __global acctyp4 *restrict ans,
+                                    __global acctyp *restrict engv,
+                                    const int eflag, const int vflag,
+                                    const int inum, const int nbor_pitch,
                                     const int t_per_atom, const numtyp kappa) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
-  
+
   __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
   if (tid<4)
@@ -139,7 +139,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
   if (tid0) {
       numtyp e=coeff[mtype].x/kappa * screening;
-      energy+=factor_lj*(e-coeff[mtype].y); 
+      energy+=factor_lj*(e-coeff[mtype].y);
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h
index 5a9ee7ae6e..ba69bc4bae 100644
--- a/lib/gpu/lal_yukawa_colloid.h
+++ b/lib/gpu/lal_yukawa_colloid.h
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
 class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
  public:
   YukawaColloid();
-  ~YukawaColloid(); 
+  ~YukawaColloid();

   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    * 
+    *
     * Returns:
     * - 0 if successfull
     * - -1 if fix gpu not found
@@ -39,8 +39,8 @@ class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq, double **host_a,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, const double kappa);

   inline void cast_rad_data(double* rad) {
@@ -70,22 +70,22 @@ class YukawaColloid : public BaseAtomic<numtyp, acctyp> {

   /// Total host memory used by library for pair style
   double host_memory_usage() const;
-  
+
   /// Pair loop with host neighboring
-  void compute(const int f_ago, const int inum_full, 
-               const int nall, double **host_x, int *host_type, 
-               int *ilist, int *numj, int **firstneigh, 
+  void compute(const int f_ago, const int inum_full,
+               const int nall, double **host_x, int *host_type,
+               int *ilist, int *numj, int **firstneigh,
                const bool eflag, const bool vflag, const bool eatom,
                const bool vatom, int &host_start, const double cpu_time,
                bool &success, double *rad);
-  
+
   /// Pair loop with device neighboring
   int** compute(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, double *sublo,
                 double *subhi, tagint *tag, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag, 
-                const bool eatom, const bool vatom, int &host_start, 
-                int **ilist, int **jnum, const double cpu_time, 
+                tagint **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **jnum, const double cpu_time,
                 bool &success, double *rad);

   // --------------------------- TEXTURES -----------------------------
@@ -101,7 +101,7 @@ class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;

-  /// Number of atom types 
+  /// Number of atom types
   int _lj_types;

   int _max_rad_size;
diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp
index 0e3c653e06..b9ce51e522 100644
--- a/lib/gpu/lal_yukawa_colloid_ext.cpp
+++ b/lib/gpu/lal_yukawa_colloid_ext.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -27,10 +27,10 @@ static YukawaColloid<PRECISION,ACC_PRECISION> YKCOLLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, 
+int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
                        double **host_offset, double *special_lj, const int inum,
                        const int nall, const int max_nbors, const int maxspecial,
-                       const double cell_size, int &gpu_mode, FILE *screen, 
+                       const double cell_size, int &gpu_mode, FILE *screen,
                        const double kappa) {
   YKCOLLMF.clear();
   gpu_mode=YKCOLLMF.device->gpu_mode();
@@ -54,8 +54,8 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
   int init_ok=0;
   if (world_me==0)
-    init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, 
-                          inum, nall, 300, maxspecial, cell_size, gpu_split, 
+    init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
+                          inum, nall, 300, maxspecial, cell_size, gpu_split,
                           screen, kappa);

   YKCOLLMF.device->world_barrier();
@@ -72,12 +72,12 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, 
-                            inum, nall, 300, maxspecial, cell_size, gpu_split, 
+      init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
+                            inum, nall, 300, maxspecial, cell_size, gpu_split,
                             screen, kappa);

     YKCOLLMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -103,11 +103,11 @@ int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full,
                                subhi, tag, nspecial, special, eflag, vflag, eatom,
                                vatom, host_start, ilist, jnum, cpu_time, success, host_rad);
-} 
+}

-void ykcolloid_gpu_compute(const int ago, const int inum_full, 
-                           const int nall, double **host_x, int *host_type, 
-                           int *ilist, int *numj, int **firstneigh, 
+void ykcolloid_gpu_compute(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           int *ilist, int *numj, int **firstneigh,
                            const bool eflag, const bool vflag, const bool eatom,
                            const bool vatom, int &host_start,
                            const double cpu_time, bool &success, double *host_rad) {
diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp
index 1cc89885aa..5136e3ea53 100644
--- a/lib/gpu/lal_yukawa_ext.cpp
+++ b/lib/gpu/lal_yukawa_ext.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -28,9 +28,9 @@ static Yukawa<PRECISION,ACC_PRECISION> YKMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
-                    double **host_a, double **offset, double *special_lj, 
-                    const int inum, const int nall, const int max_nbors, 
-                    const int maxspecial, const double cell_size, 
+                    double **host_a, double **offset, double *special_lj,
+                    const int inum, const int nall, const int max_nbors,
+                    const int maxspecial, const double cell_size,
                     int &gpu_mode, FILE *screen) {
   YKMF.clear();
   gpu_mode=YKMF.device->gpu_mode();
@@ -54,8 +54,8 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
   int init_ok=0;
   if (world_me==0)
-    init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, 
-                      inum, nall, 300, maxspecial, cell_size, 
+    init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj,
+                      inum, nall, 300, maxspecial, cell_size,
                       gpu_split, screen);

   YKMF.device->world_barrier();
@@ -72,12 +72,12 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, 
-                        inum, nall, 300, maxspecial, cell_size, 
+      init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj,
+                        inum, nall, 300, maxspecial, cell_size,
                         gpu_split, screen);

     YKMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -102,7 +102,7 @@ int ** yukawa_gpu_compute_n(const int ago, const int inum_full,
   return YKMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success);
-} 
+}

 void yukawa_gpu_compute(const int ago, const int inum_full, const int nall,
                         double **host_x, int *host_type, int *ilist, int *numj,
diff --git a/lib/gpu/lal_zbl.cpp b/lib/gpu/lal_zbl.cpp
index e172d48b33..a45faf01c3 100644
--- a/lib/gpu/lal_zbl.cpp
+++ b/lib/gpu/lal_zbl.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
  ***************************************************************************/
@@ -33,10 +33,10 @@ ZBLT::ZBL() : BaseAtomic<numtyp, acctyp>(), _allocated(false) {
 }

 template <class numtyp, class acctyp>
-ZBLT::~ZBL() { 
+ZBLT::~ZBL() {
   clear();
 }
- 
+
 template <class numtyp, class acctyp>
 int ZBLT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -44,15 +44,15 @@ int ZBLT::bytes_per_atom(const int max_nbors) const {

 template <class numtyp, class acctyp>
 int ZBLT::init(const int ntypes, double **host_cutsq,
-               double **host_sw1, double **host_sw2, 
-               double **host_sw3, double **host_sw4, 
+               double **host_sw1, double **host_sw2,
+               double **host_sw3, double **host_sw4,
                double **host_sw5,
-               double **host_d1a, double **host_d2a, 
-               double **host_d3a, double **host_d4a, 
-               double **host_zze, double cut_globalsq, 
+               double **host_d1a, double **host_d2a,
+               double **host_d3a, double **host_d4a,
+               double **host_zze, double cut_globalsq,
                double cut_innersq, double cut_inner,
-               const int nlocal, const int nall, const int max_nbors, 
-               const int maxspecial, const double cell_size, 
+               const int nlocal, const int nall, const int max_nbors,
+               const int maxspecial, const double cell_size,
                const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@@ -88,7 +88,7 @@ int ZBLT::init(const int ntypes, double **host_cutsq,
   coeff3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff3,host_write,host_sw3,host_sw4,host_sw5);

-  _cut_globalsq = cut_globalsq; 
+  _cut_globalsq = cut_globalsq;
   _cut_innersq = cut_innersq;
   _cut_inner = cut_inner;
@@ -131,7 +131,7 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-  
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu
index b14753b5fa..30bbc8aa2e 100644
--- a/lib/gpu/lal_zbl.cu
+++ b/lib/gpu/lal_zbl.cu
@@ -9,7 +9,7 @@
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 //  __________________________________________________________________________
 //
-//    begin                : 
+//    begin                :
 //    email                : ndactrung@gmail.com
 // ***************************************************************************/
@@ -35,9 +35,9 @@ texture pos_tex;
    compute ZBL pair energy
 ------------------------------------------------------------------------- */

-ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij, 
+ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij,
                         numtyp d3aij, numtyp d4aij, numtyp zzeij) {
-  
+
   numtyp rinv = ucl_recip(r);

   numtyp sum = c1*ucl_exp(-d1aij*r);
@@ -54,7 +54,7 @@ ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij,
    compute ZBL first derivative
 ------------------------------------------------------------------------- */

-ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij, 
+ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij,
                          numtyp d3aij, numtyp d4aij, numtyp zzeij) {

   numtyp rinv = ucl_recip(r);
@@ -72,24 +72,24 @@ ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij,
   sum_p -= c2*d2aij*e2;
   sum_p -= c3*d3aij*e3;
   sum_p -= c4*d4aij*e4;
-  
+
   numtyp result = zzeij*(sum_p - sum*rinv)*rinv;
-  
+
   return result;
 };

-__kernel void k_zbl(const __global numtyp4 *restrict x_, 
+__kernel void k_zbl(const __global numtyp4 *restrict x_,
                     const __global numtyp4 *restrict coeff1,
                     const __global numtyp4 *restrict coeff2,
                     const __global numtyp4 *restrict coeff3,
-                    const double cut_globalsq, 
-                    const double cut_innersq, 
-                    const double cut_inner, 
-                    const int lj_types, 
-                    const __global int *dev_nbor, 
-                    const __global int *dev_packed, 
+                    const double cut_globalsq,
+                    const double cut_innersq,
+                    const double cut_inner,
+                    const int lj_types,
+                    const __global int *dev_nbor,
+                    const __global int *dev_packed,
                     __global acctyp4 *restrict ans,
-                    __global acctyp *restrict engv, 
+                    __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -101,19 +101,19 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-  
+
   if (iicut_innersq) {
       t = r - cut_inner;
       force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
@@ -146,14 +146,14 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
     f.z+=delz*force;

     if (eflag>0) {
-      numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, 
+      numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                      coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
       e += coeff3[mtype].z;
       if (rsq > cut_innersq) {
         e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
       }
-      energy+=e; 
+      energy+=e;
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -171,22 +171,22 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   } // if ii
 }

-__kernel void k_zbl_fast(const __global numtyp4 *restrict x_, 
+__kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict coeff1_in,
                          const __global numtyp4 *restrict coeff2_in,
                          const __global numtyp4 *restrict coeff3_in,
-                         const double cut_globalsq, 
-                         const double cut_innersq, 
-                         const double cut_inner, 
+                         const double cut_globalsq,
+                         const double cut_innersq,
+                         const double cut_inner,
                          const __global int *dev_nbor,
-                         const __global int *dev_packed, 
+                         const __global int *dev_packed,
                          __global acctyp4 *restrict ans,
-                         __global acctyp *restrict engv, 
-                         const int eflag, const int vflag, const int inum, 
+                         __global acctyp *restrict engv,
+                         const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
-  
+
   __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -195,7 +195,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     coeff2[tid]=coeff2_in[tid];
     coeff3[tid]=coeff3_in[tid];
   }
-  
+
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -204,7 +204,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   __syncthreads();
-  
+
   if (iicut_innersq) {
       t = r - cut_inner;
       force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
@@ -249,14 +249,14 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     f.z+=delz*force;

     if (eflag>0) {
-      numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, 
+      numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                      coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
       e += coeff3[mtype].z;
       if (rsq > cut_innersq) {
         e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
       }
-      energy+=e; 
+      energy+=e;
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_zbl.h b/lib/gpu/lal_zbl.h
index 2996d90a5c..9885fcedf2 100644
--- a/lib/gpu/lal_zbl.h
+++ b/lib/gpu/lal_zbl.h
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
  ***************************************************************************/
@@ -24,27 +24,27 @@ template <class numtyp, class acctyp>
 class ZBL : public BaseAtomic<numtyp, acctyp> {
  public:
   ZBL();
-  ~ZBL(); 
+  ~ZBL();

   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    * 
+    *
     * Returns:
     * - 0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
-  int init(const int ntypes, double **host_cutsq, double **host_sw1, 
+  int init(const int ntypes, double **host_cutsq, double **host_sw1,
            double **host_sw2, double **host_sw3, double **host_sw4,
            double **host_sw5,
-           double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a, 
+           double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a,
            double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);
-  
+
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
@@ -70,8 +70,8 @@ class ZBL : public BaseAtomic<numtyp, acctyp> {
   double _cut_globalsq;
   double _cut_innersq;
   double _cut_inner;
-  
-  /// Number of atom types 
+
+  /// Number of atom types
   int _lj_types;

  private:
diff --git a/lib/gpu/lal_zbl_ext.cpp b/lib/gpu/lal_zbl_ext.cpp
index ddce858076..5fd003b8ca 100644
--- a/lib/gpu/lal_zbl_ext.cpp
+++ b/lib/gpu/lal_zbl_ext.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
  ***************************************************************************/
@@ -27,11 +27,11 @@ static ZBL<PRECISION,ACC_PRECISION> ZBLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, 
+int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
                  double **host_sw2, double **host_sw3, double **host_sw4,
                  double **host_sw5,
-                 double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a, 
+                 double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a,
                  double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner,
-                 const int inum, const int nall, const int max_nbors, 
+                 const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) {
   ZBLMF.clear();
   gpu_mode=ZBLMF.device->gpu_mode();
@@ -55,7 +55,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
   int init_ok=0;
   if (world_me==0)
-    init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, 
+    init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4,
                        host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze,
                        cut_globalsq, cut_innersq, cut_inner, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen);
@@ -74,13 +74,13 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, 
+      init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4,
                          host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze,
-                         cut_globalsq, cut_innersq, cut_inner, 
+                         cut_globalsq, cut_innersq, cut_inner,
                          inum, nall, 300, maxspecial, cell_size, gpu_split, screen);

     ZBLMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -105,7 +105,7 @@ int ** zbl_gpu_compute_n(const int ago, const int inum_full,
   return ZBLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success);
-} 
+}

 void zbl_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,