diff --git a/doc/src/pair_snap.rst b/doc/src/pair_snap.rst index d78a68d9f5..1bc17fa8c8 100644 --- a/doc/src/pair_snap.rst +++ b/doc/src/pair_snap.rst @@ -193,17 +193,17 @@ The keywords *chunksize* and *parallelthresh* are only applicable when using the pair style *snap* with the KOKKOS package on GPUs and are ignored otherwise. The *chunksize* keyword controls the number of atoms in each pass used to compute the bispectrum components and is used to -avoid running out of memory. For example if there are 8192 atoms in the +avoid running out of memory. For example if there are 8192 atoms in the simulation and the *chunksize* is set to 4096, the bispectrum calculation will be broken up into two passes (running on a single GPU). The *parallelthresh* keyword controls a crossover threshold for -performing extra parallelism. For small systems, exposing additional +performing extra parallelism. For small systems, exposing additional parallelism can be beneficial when there is not enough work to fully saturate the GPU threads otherwise. However, the extra parallelism also leads to more divergence and can hurt performance when the system is -already large enough to saturate the GPU threads. Extra parallelism will -be performed if the *chunksize* (or total number of atoms per GPU) is -smaller than *parallelthresh*. +already large enough to saturate the GPU threads. Extra parallelism +will be performed if the *chunksize* (or total number of atoms per GPU) +is smaller than *parallelthresh*. Detailed definitions for all the other keywords are given on the :doc:`compute sna/atom ` doc page. diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 9045420425..84fbddd4e9 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -56,7 +56,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name, const int disable_fast_math) { screen=_screen; int gpu_nbor=0; @@ -83,7 +83,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,disable_fast_math); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -321,14 +321,20 @@ double BaseChargeT::host_memory_usage_atomic() const { template void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { + const char *kname, + const int disable_fast_math) { if (_compiled) return; std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + std::string device_compile_string; + if (disable_fast_math) + device_compile_string = device->compile_string_nofast(); + else + device_compile_string = device->compile_string(); + std::string oclstring = device_compile_string+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); @@ -336,7 +342,7 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, q_tex.get_texture(*pair_program,"q_tex"); #if defined(LAL_OCL_EV_JIT) - oclstring = device->compile_string()+" -DEVFLAG=0"; + oclstring = device_compile_string+" -DEVFLAG=0"; if (pair_program_noev) delete pair_program_noev; pair_program_noev=new UCL_Program(dev); pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index 6b8761092a..307c5c079f 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -44,6 +44,7 @@ class BaseCharge { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation + * \param disable_fast_math override any fast math opts for kernel JIT * * Returns: * - 0 if successful @@ -54,7 +55,8 @@ class BaseCharge { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int disable_fast_math = 0); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -198,7 +200,8 @@ class BaseCharge { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int disable_fast_math); virtual int loop(const int eflag, const int vflag) = 0; }; diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 2e22b2f602..fa060bea5a 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -224,7 +224,9 @@ void BaseEllipsoidT::output_times() { #ifdef USE_OPENCL // Workaround for timing issue on Intel OpenCL + if (times[0] > 80e6) times[0]=0.0; if (times[3] > 80e6) times[3]=0.0; + if (times[6] > 80e6) times[6]=0.0; #endif if (device->replica_me()==0) @@ -237,17 +239,18 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (device->procs_per_gpu()==1 && times[3]>0) { - fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); + if (device->procs_per_gpu()==1 && (times[3] > 0.0)) { + if (times[0] > 0.0) + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); - if (nbor->gpu_nbor()>0) + if (nbor->gpu_nbor() > 0.0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } - if (times[6]>0) + if (times[6] > 0.0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom); diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index e6caebbab8..9aac866353 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -57,7 +57,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho const double alf, const double e_shift, const double f_shift) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,born_coul_wolf,"k_born_coul_wolf"); + _screen,born_coul_wolf,"k_born_coul_wolf",1); if (success!=0) return success; diff --git a/lib/gpu/lal_born_coul_wolf_cs.cpp b/lib/gpu/lal_born_coul_wolf_cs.cpp index 8deceeb1f4..abd4da439a 100644 --- a/lib/gpu/lal_born_coul_wolf_cs.cpp +++ b/lib/gpu/lal_born_coul_wolf_cs.cpp @@ -42,7 +42,7 @@ int BornCoulWolfCST::init(const int ntypes, double **host_cutsq, double **host_r const double alf, const double e_shift, const double f_shift) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs"); + _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs",1); if (success!=0) return success; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index e2b5b9cdb5..e43e77a761 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -420,6 +420,16 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) { return 0; } +template +std::string DeviceT::compile_string_nofast() { + std::string no_fast = _ocl_compile_string; + size_t p = no_fast.find("-cl-fast-relaxed-math "); + if (p != std::string::npos) no_fast.erase(p,22); + p = no_fast.find("-DFAST_MATH="); + if (p != std::string::npos) no_fast[p + 12]='0'; + return no_fast; +} + template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, @@ -777,28 +787,30 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, #ifdef USE_OPENCL // Workaround for timing issue on Intel OpenCL + if (times[0] > 80e6) times[0]=0.0; if (times[3] > 80e6) times[3]=0.0; if (times[5] > 80e6) times[5]=0.0; #endif if (replica_me()==0) - if (screen && times[6]>0.0) { + if (screen && (times[6] > 0.0)) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device() && times[3]>0) { - fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); + if (time_device() && (times[3] > 0.0)) { + if (times[0] > 0.0) + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); - if (nbor.gpu_nbor()>0) + if (nbor.gpu_nbor() > 0.0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } - if (times[5]>0) + if (times[5] > 0.0) fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 1db6ae3127..933a3508b5 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -312,6 +312,7 @@ class Device { } inline std::string compile_string() { return _ocl_compile_string; } + std::string compile_string_nofast(); inline std::string ocl_config_name() { return _ocl_config_name; } template diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 013d2b3ede..2a984eefb6 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -406,8 +406,8 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW KOKKOS_INTERNAL_USE_TM := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc )) -KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc)) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc ) +KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) ) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 71ab3f4cb4..843bff2a35 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -13,25 +13,26 @@ ------------------------------------------------------------------------- */ #include "fix_gpu.h" -#include #include "atom.h" +#include "citeme.h" #include "comm.h" +#include "domain.h" +#include "error.h" #include "force.h" +#include "gpu_extra.h" +#include "input.h" +#include "modify.h" +#include "neighbor.h" #include "pair.h" #include "pair_hybrid.h" #include "pair_hybrid_overlay.h" #include "respa.h" -#include "input.h" #include "timer.h" -#include "modify.h" -#include "update.h" -#include "domain.h" #include "universe.h" -#include "gpu_extra.h" -#include "neighbor.h" -#include "citeme.h" -#include "error.h" +#include "update.h" + +#include #if (LAL_USE_OMP == 1) #include @@ -275,12 +276,15 @@ void FixGPU::init() error->warning(FLERR,"Using package gpu without any pair style defined"); // make sure fdotr virial is not accumulated multiple times + // also disallow GPU neighbor lists for hybrid styles if (force->pair_match("^hybrid",0) != nullptr) { PairHybrid *hybrid = (PairHybrid *) force->pair; for (int i = 0; i < hybrid->nstyles; i++) if (!utils::strmatch(hybrid->keywords[i],"/gpu$")) force->pair->no_virial_fdotr_compute = 1; + if (_gpu_mode != GPU_FORCE) + error->all(FLERR, "Must not use GPU neighbor lists with hybrid pair style"); } // rRESPA support @@ -295,8 +299,7 @@ void FixGPU::setup(int vflag) { if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) if (neighbor->exclude_setting() != 0) - error->all(FLERR, - "Cannot use neigh_modify exclude with GPU neighbor builds"); + error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds"); if (utils::strmatch(update->integrate_style,"^verlet")) post_force(vflag); else { diff --git a/src/GRANULAR/fix_freeze.h b/src/GRANULAR/fix_freeze.h index 2fc5fda71e..5846bfd769 100644 --- a/src/GRANULAR/fix_freeze.h +++ b/src/GRANULAR/fix_freeze.h @@ -30,7 +30,7 @@ class FixFreeze : public Fix { int setmask(); void init(); void setup(int); - void post_force(int); + virtual void post_force(int); void post_force_respa(int, int, int); double compute_vector(int); diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h index 27daa5f3d2..c7bd60b00b 100644 --- a/src/INTEL/intel_preprocess.h +++ b/src/INTEL/intel_preprocess.h @@ -20,10 +20,6 @@ #define USE_OMP_SIMD #define __INTEL_COMPILER __INTEL_LLVM_COMPILER #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER -#define _MM_SCALE_1 1 -#define _MM_SCALE_2 2 -#define _MM_SCALE_4 4 -#define _MM_SCALE_8 8 #endif #ifdef __INTEL_COMPILER diff --git a/src/INTEL/intel_simd.h b/src/INTEL/intel_simd.h index 2affa6a394..d75b2b9175 100644 --- a/src/INTEL/intel_simd.h +++ b/src/INTEL/intel_simd.h @@ -35,6 +35,13 @@ authors for more details. #ifdef __AVX512F__ +#ifndef _MM_SCALE_1 +#define _MM_SCALE_1 1 +#define _MM_SCALE_2 2 +#define _MM_SCALE_4 4 +#define _MM_SCALE_8 8 +#endif + namespace ip_simd { typedef __mmask16 SIMD_mask; diff --git a/src/KOKKOS/compute_temp_deform_kokkos.h b/src/KOKKOS/compute_temp_deform_kokkos.h index 8b53c1f633..0292c6776d 100644 --- a/src/KOKKOS/compute_temp_deform_kokkos.h +++ b/src/KOKKOS/compute_temp_deform_kokkos.h @@ -1,4 +1,3 @@ -// clang-format off /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories @@ -20,6 +19,7 @@ ComputeStyle(temp/deform/kk/host,ComputeTempDeformKokkos); // clang-format on #else +// clang-format off #ifndef LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H #define LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H diff --git a/src/KOKKOS/fix_freeze_kokkos.cpp b/src/KOKKOS/fix_freeze_kokkos.cpp index 190a054f8b..9a486a3fcc 100644 --- a/src/KOKKOS/fix_freeze_kokkos.cpp +++ b/src/KOKKOS/fix_freeze_kokkos.cpp @@ -28,41 +28,16 @@ FixFreezeKokkos::FixFreezeKokkos(LAMMPS *lmp, int narg, char **arg) atomKK = (AtomKokkos *)atom; execution_space = ExecutionSpaceFromDevice::space; - datamask_read = F_MASK | MASK_MASK; + datamask_read = F_MASK | MASK_MASK | TORQUE_MASK; datamask_modify = F_MASK | TORQUE_MASK; } /* ---------------------------------------------------------------------- */ -template -int FixFreezeKokkos::setmask() -{ - return FixFreeze::setmask(); -} - -/* ---------------------------------------------------------------------- */ - -template -void FixFreezeKokkos::init() -{ - FixFreeze::init(); -} - -/* ---------------------------------------------------------------------- */ - -template -void FixFreezeKokkos::setup(int vflag) -{ - FixFreeze::setup(vflag); -} - -/* ---------------------------------------------------------------------- */ - template void FixFreezeKokkos::post_force(int /*vflag*/) { atomKK->sync(execution_space,datamask_read); - atomKK->modified(execution_space,datamask_modify); f = atomKK->k_f.view(); torque = atomKK->k_torque.view(); @@ -80,28 +55,10 @@ void FixFreezeKokkos::post_force(int /*vflag*/) foriginal[0] = original.values[0]; foriginal[1] = original.values[1]; foriginal[2] = original.values[2]; + + atomKK->modified(execution_space,datamask_modify); } -/* ---------------------------------------------------------------------- */ - -template -void FixFreezeKokkos::post_force_respa(int vflag, int /*ilevel*/, int /*iloop*/) -{ - post_force(vflag); -} - -/* ---------------------------------------------------------------------- - return components of total force on fix group before force was changed -------------------------------------------------------------------------- */ - -template -double FixFreezeKokkos::compute_vector(int n) -{ - return FixFreeze::compute_vector(n); -} - -/* ---------------------------------------------------------------------- */ - template KOKKOS_INLINE_FUNCTION void FixFreezeKokkos::operator()(const int i, OriginalForce &original) const { diff --git a/src/KOKKOS/fix_freeze_kokkos.h b/src/KOKKOS/fix_freeze_kokkos.h index dcfc14bd3d..67d4f3272c 100644 --- a/src/KOKKOS/fix_freeze_kokkos.h +++ b/src/KOKKOS/fix_freeze_kokkos.h @@ -31,6 +31,7 @@ namespace LAMMPS_NS { template class FixFreezeKokkos : public FixFreeze { public: + typedef DeviceType device_type; struct OriginalForce { double values[3]; @@ -58,12 +59,7 @@ class FixFreezeKokkos : public FixFreeze { }; FixFreezeKokkos(class LAMMPS *, int, char **); - int setmask(); - void init(); - void setup(int); void post_force(int); - void post_force_respa(int, int, int); - double compute_vector(int); KOKKOS_INLINE_FUNCTION void operator()(const int i, OriginalForce &original) const; diff --git a/src/KOKKOS/fix_neigh_history_kokkos.cpp b/src/KOKKOS/fix_neigh_history_kokkos.cpp index 611a8a40ef..4837148ee0 100644 --- a/src/KOKKOS/fix_neigh_history_kokkos.cpp +++ b/src/KOKKOS/fix_neigh_history_kokkos.cpp @@ -87,6 +87,9 @@ void FixNeighHistoryKokkos::pre_exchange() { copymode = 1; + k_firstflag.sync(); + k_firstvalue.sync(); + h_resize() = 1; while (h_resize() > 0) { FixNeighHistoryKokkosZeroPartnerCountFunctor zero(this); @@ -168,6 +171,9 @@ void FixNeighHistoryKokkos::post_neighbor() { tag = atomKK->k_tag.view(); + k_firstflag.sync(); + k_firstvalue.sync(); + int inum = pair->list->inum; NeighListKokkos* k_list = static_cast*>(pair->list); d_numneigh = k_list->d_numneigh; @@ -185,8 +191,10 @@ void FixNeighHistoryKokkos::post_neighbor() if (maxatom < nlocal || k_list->maxneighs > (int)d_firstflag.extent(1)) { maxatom = nall; - d_firstflag = Kokkos::View("neighbor_history:firstflag",maxatom,k_list->maxneighs); - d_firstvalue = Kokkos::View("neighbor_history:firstvalue",maxatom,k_list->maxneighs*dnum); + k_firstflag = DAT::tdual_int_2d("neighbor_history:firstflag",maxatom,k_list->maxneighs); + k_firstvalue = DAT::tdual_float_2d("neighbor_history:firstvalue",maxatom,k_list->maxneighs*dnum); + d_firstflag = k_firstflag.view(); + d_firstvalue = k_firstvalue.view(); } copymode = 1; @@ -194,6 +202,9 @@ void FixNeighHistoryKokkos::post_neighbor() FixNeighHistoryKokkosPostNeighborFunctor f(this); Kokkos::parallel_for(inum,f); + k_firstflag.modify(); + k_firstvalue.modify(); + copymode = 0; } diff --git a/src/KOKKOS/fix_neigh_history_kokkos.h b/src/KOKKOS/fix_neigh_history_kokkos.h index acc1e9c408..0442b46cbd 100644 --- a/src/KOKKOS/fix_neigh_history_kokkos.h +++ b/src/KOKKOS/fix_neigh_history_kokkos.h @@ -50,10 +50,13 @@ class FixNeighHistoryKokkos : public FixNeighHistory { KOKKOS_INLINE_FUNCTION void post_neighbor_item(const int &ii) const; - typename Kokkos::View d_firstflag; - typename Kokkos::View d_firstvalue; + typename DAT::tdual_int_2d k_firstflag; + typename DAT::tdual_float_2d k_firstvalue; private: + typename ArrayTypes::t_int_2d d_firstflag; + typename ArrayTypes::t_float_2d d_firstvalue; + typename ArrayTypes::tdual_int_1d k_npartner; typename ArrayTypes::tdual_tagint_2d k_partner; typename ArrayTypes::tdual_float_2d k_valuepartner; @@ -74,6 +77,7 @@ class FixNeighHistoryKokkos : public FixNeighHistory { template struct FixNeighHistoryKokkosZeroPartnerCountFunctor { + typedef DeviceType device_type; FixNeighHistoryKokkos c; FixNeighHistoryKokkosZeroPartnerCountFunctor(FixNeighHistoryKokkos *c_ptr): c(*c_ptr) {} KOKKOS_INLINE_FUNCTION @@ -84,6 +88,7 @@ struct FixNeighHistoryKokkosZeroPartnerCountFunctor { template struct FixNeighHistoryKokkosPreExchangeFunctor { + typedef DeviceType device_type; FixNeighHistoryKokkos c; FixNeighHistoryKokkosPreExchangeFunctor(FixNeighHistoryKokkos *c_ptr): c(*c_ptr) {} KOKKOS_INLINE_FUNCTION @@ -94,6 +99,7 @@ struct FixNeighHistoryKokkosPreExchangeFunctor { template struct FixNeighHistoryKokkosPostNeighborFunctor { + typedef DeviceType device_type; FixNeighHistoryKokkos c; FixNeighHistoryKokkosPostNeighborFunctor(FixNeighHistoryKokkos *c_ptr): c(*c_ptr) {} KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/fix_nve_sphere_kokkos.cpp b/src/KOKKOS/fix_nve_sphere_kokkos.cpp index 787171b6ce..12b170acee 100644 --- a/src/KOKKOS/fix_nve_sphere_kokkos.cpp +++ b/src/KOKKOS/fix_nve_sphere_kokkos.cpp @@ -31,8 +31,8 @@ FixNVESphereKokkos::FixNVESphereKokkos(LAMMPS *lmp, int narg, char * atomKK = (AtomKokkos *)atom; execution_space = ExecutionSpaceFromDevice::space; - datamask_read = F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK; - datamask_modify = X_MASK | V_MASK | OMEGA_MASK; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; } /* ---------------------------------------------------------------------- */ @@ -61,8 +61,7 @@ void FixNVESphereKokkos::init() template void FixNVESphereKokkos::initial_integrate(int /*vflag*/) { - atomKK->sync(execution_space,datamask_read); - atomKK->modified(execution_space,datamask_modify); + atomKK->sync(execution_space, X_MASK | V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK); x = atomKK->k_x.view(); v = atomKK->k_v.view(); @@ -78,6 +77,8 @@ void FixNVESphereKokkos::initial_integrate(int /*vflag*/) FixNVESphereKokkosInitialIntegrateFunctor f(this); Kokkos::parallel_for(nlocal,f); + + atomKK->modified(execution_space, X_MASK | V_MASK | OMEGA_MASK); } /* ---------------------------------------------------------------------- */ @@ -109,8 +110,7 @@ void FixNVESphereKokkos::initial_integrate_item(const int i) const template void FixNVESphereKokkos::final_integrate() { - atomKK->sync(execution_space,datamask_read); - atomKK->modified(execution_space,datamask_modify); + atomKK->sync(execution_space, V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK); v = atomKK->k_v.view(); omega = atomKK->k_omega.view(); @@ -125,6 +125,8 @@ void FixNVESphereKokkos::final_integrate() FixNVESphereKokkosFinalIntegrateFunctor f(this); Kokkos::parallel_for(nlocal,f); + + atomKK->modified(execution_space, V_MASK | OMEGA_MASK); } /* ---------------------------------------------------------------------- */ diff --git a/src/KOKKOS/fix_nve_sphere_kokkos.h b/src/KOKKOS/fix_nve_sphere_kokkos.h index 888a1baa0d..f3e3df13d4 100644 --- a/src/KOKKOS/fix_nve_sphere_kokkos.h +++ b/src/KOKKOS/fix_nve_sphere_kokkos.h @@ -56,6 +56,7 @@ class FixNVESphereKokkos : public FixNVESphere { template struct FixNVESphereKokkosInitialIntegrateFunctor { + typedef DeviceType device_type; FixNVESphereKokkos c; FixNVESphereKokkosInitialIntegrateFunctor(FixNVESphereKokkos *c_ptr): c(*c_ptr) { c.cleanup_copy(); } KOKKOS_INLINE_FUNCTION @@ -66,6 +67,7 @@ struct FixNVESphereKokkosInitialIntegrateFunctor { template struct FixNVESphereKokkosFinalIntegrateFunctor { + typedef DeviceType device_type; FixNVESphereKokkos c; FixNVESphereKokkosFinalIntegrateFunctor(FixNVESphereKokkos *c_ptr): c(*c_ptr) { c.cleanup_copy(); } KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp index d0af72f17f..5ba0e6b666 100644 --- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp +++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp @@ -65,8 +65,6 @@ void FixNVTSllodKokkos::init() { FixNHKokkos::init(); - vdelu = typename ArrayTypes::t_v_array("nvt/sllod/kk:vdelu", atomKK->nlocal); - if (!this->temperature->tempbias) this->error->all(FLERR,"Temperature for fix nvt/sllod does not have a bias"); @@ -100,7 +98,7 @@ void FixNVTSllodKokkos::nh_v_temp() // calculate temperature since some computes require temp // computed on current nlocal atoms to remove bias - if (nondeformbias){ + if (nondeformbias) { atomKK->sync(this->temperature->execution_space,this->temperature->datamask_read); this->temperature->compute_scalar(); atomKK->modified(this->temperature->execution_space,this->temperature->datamask_modify); @@ -115,6 +113,9 @@ void FixNVTSllodKokkos::nh_v_temp() d_h_two = Few(h_two); + if (vdelu.extent(0) < atomKK->nmax) + vdelu = typename AT::t_v_array(Kokkos::NoInit("nvt/sllod/kk:vdelu"), atomKK->nmax); + this->copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,nlocal),*this); this->copymode = 0; diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.h b/src/KOKKOS/fix_nvt_sllod_kokkos.h index 6057ce44d0..84e57ab2c3 100644 --- a/src/KOKKOS/fix_nvt_sllod_kokkos.h +++ b/src/KOKKOS/fix_nvt_sllod_kokkos.h @@ -35,6 +35,9 @@ struct TagFixNVTSllod_temp2{}; template class FixNVTSllodKokkos : public FixNHKokkos { public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + FixNVTSllodKokkos(class LAMMPS *, int, char **); ~FixNVTSllodKokkos() {} void init(); @@ -51,14 +54,14 @@ class FixNVTSllodKokkos : public FixNHKokkos { void nh_v_temp(); protected: - typename ArrayTypes::t_x_array x; - typename ArrayTypes::t_v_array v; - typename ArrayTypes::t_v_array vdelu; - typename ArrayTypes::t_f_array_const f; - typename ArrayTypes::t_float_1d rmass; - typename ArrayTypes::t_float_1d mass; - typename ArrayTypes::t_int_1d type; - typename ArrayTypes::t_int_1d mask; + typename AT::t_x_array x; + typename AT::t_v_array v; + typename AT::t_v_array vdelu; + typename AT::t_f_array_const f; + typename AT::t_float_1d rmass; + typename AT::t_float_1d mass; + typename AT::t_int_1d type; + typename AT::t_int_1d mask; Few d_h_two; diff --git a/src/KOKKOS/fix_property_atom_kokkos.cpp b/src/KOKKOS/fix_property_atom_kokkos.cpp index 841b791c5c..97931f5d67 100644 --- a/src/KOKKOS/fix_property_atom_kokkos.cpp +++ b/src/KOKKOS/fix_property_atom_kokkos.cpp @@ -45,23 +45,23 @@ FixPropertyAtomKokkos::FixPropertyAtomKokkos(LAMMPS *lmp, int narg, char **arg) void FixPropertyAtomKokkos::grow_arrays(int nmax) { for (int m = 0; m < nvalue; m++) { - if (style[m] == MOLECULE) { + if (styles[m] == MOLECULE) { memory->grow(atom->molecule,nmax,"atom:molecule"); size_t nbytes = (nmax-nmax_old) * sizeof(tagint); memset(&atom->molecule[nmax_old],0,nbytes); - } else if (style[m] == CHARGE) { + } else if (styles[m] == CHARGE) { memory->grow(atom->q,nmax,"atom:q"); size_t nbytes = (nmax-nmax_old) * sizeof(double); memset(&atom->q[nmax_old],0,nbytes); - } else if (style[m] == RMASS) { + } else if (styles[m] == RMASS) { memory->grow(atom->rmass,nmax,"atom:rmass"); size_t nbytes = (nmax-nmax_old) * sizeof(double); memset(&atom->rmass[nmax_old],0,nbytes); - } else if (style[m] == INTEGER) { + } else if (styles[m] == INTEGER) { memory->grow(atom->ivector[index[m]],nmax,"atom:ivector"); size_t nbytes = (nmax-nmax_old) * sizeof(int); memset(&atom->ivector[index[m]][nmax_old],0,nbytes); - } else if (style[m] == DOUBLE) { + } else if (styles[m] == DOUBLE) { atomKK->sync(Device,DVECTOR_MASK); memoryKK->grow_kokkos(atomKK->k_dvector,atomKK->dvector,atomKK->k_dvector.extent(0),nmax, "atom:dvector"); diff --git a/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp b/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp index fab33d0ec7..b47b5f5a47 100644 --- a/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp +++ b/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp @@ -165,8 +165,11 @@ void PairGranHookeHistoryKokkos::compute(int eflag_in, int vflag_in) d_neighbors.extent(1) != d_neighbors_touch.extent(1)) d_neighbors_touch = typename AT::t_neighbors_2d("pair:neighbors_touch",d_neighbors.extent(0),d_neighbors.extent(1)); - d_firsttouch = fix_historyKK->d_firstflag; - d_firstshear = fix_historyKK->d_firstvalue; + fix_historyKK->k_firstflag.template sync(); + fix_historyKK->k_firstvalue.template sync(); + + d_firsttouch = fix_historyKK->k_firstflag.template view(); + d_firstshear = fix_historyKK->k_firstvalue.template view(); Kokkos::parallel_for(Kokkos::RangePolicy(0,inum),*this); @@ -258,6 +261,11 @@ void PairGranHookeHistoryKokkos::compute(int eflag_in, int vflag_in) } } + if (eflag_atom) { + k_eatom.template modify(); + k_eatom.template sync(); + } + if (vflag_global) { virial[0] += ev.v[0]; virial[1] += ev.v[1]; diff --git a/src/KOKKOS/pair_gran_hooke_history_kokkos.h b/src/KOKKOS/pair_gran_hooke_history_kokkos.h index 6b887c0df4..37fb208a70 100644 --- a/src/KOKKOS/pair_gran_hooke_history_kokkos.h +++ b/src/KOKKOS/pair_gran_hooke_history_kokkos.h @@ -92,8 +92,8 @@ class PairGranHookeHistoryKokkos : public PairGranHookeHistory { typename AT::t_int_1d_randomread d_ilist; typename AT::t_int_1d_randomread d_numneigh; - typename Kokkos::View d_firsttouch; - typename Kokkos::View d_firstshear; + typename AT::t_int_2d d_firsttouch; + typename AT::t_float_2d d_firstshear; typename AT::t_neighbors_2d d_neighbors_touch; typename AT::t_int_1d d_numneigh_touch; diff --git a/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp b/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp index 66064d58b2..982ec9d99e 100644 --- a/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp +++ b/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp @@ -69,7 +69,7 @@ PairLJCharmmCoulCharmmKokkos::~PairLJCharmmCoulCharmmKokkos() if (allocated) { memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->destroy_kokkos(k_vatom,vatom); - k_cutsq = DAT::tdual_ffloat_2d(); + memoryKK->destroy_kokkos(k_cutsq,cutsq); } } diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index a7f58f2525..d71d7d1bad 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -1,3 +1,4 @@ +// clang-format off /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp index 909c6930cf..5e9ab757c6 100644 --- a/src/KOKKOS/verlet_kokkos.cpp +++ b/src/KOKKOS/verlet_kokkos.cpp @@ -92,40 +92,22 @@ void VerletKokkos::setup(int flag) // acquire ghosts // build neighbor lists - atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); + lmp->kokkos->auto_sync = 1; - atomKK->setup(); + atom->setup(); modify->setup_pre_exchange(); - // debug - atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); - if (triclinic) domain->x2lamda(atomKK->nlocal); + if (triclinic) domain->x2lamda(atom->nlocal); domain->pbc(); - - atomKK->sync(Host,ALL_MASK); - - domain->reset_box(); comm->setup(); if (neighbor->style) neighbor->setup_bins(); - comm->exchange(); - - if (atomKK->sortfreq > 0) atomKK->sort(); - + if (atom->sortfreq > 0) atom->sort(); comm->borders(); - - if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); - - atomKK->sync(Host,ALL_MASK); - + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); domain->image_check(); domain->box_too_small_check(); modify->setup_pre_neighbor(); - - atomKK->modified(Host,ALL_MASK); - neighbor->build(1); modify->setup_post_neighbor(); neighbor->ncalls = 0; @@ -144,7 +126,7 @@ void VerletKokkos::setup(int flag) } else if (force->pair) force->pair->compute_dummy(eflag,vflag); - if (atomKK->molecular != Atom::ATOMIC) { + if (atom->molecular != Atom::ATOMIC) { if (force->bond) { atomKK->sync(force->bond->execution_space,force->bond->datamask_read); force->bond->compute(eflag,vflag); @@ -200,35 +182,21 @@ void VerletKokkos::setup_minimal(int flag) // acquire ghosts // build neighbor lists + lmp->kokkos->auto_sync = 1; + if (flag) { - atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); - modify->setup_pre_exchange(); - // debug - atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); - - if (triclinic) domain->x2lamda(atomKK->nlocal); + if (triclinic) domain->x2lamda(atom->nlocal); domain->pbc(); - - atomKK->sync(Host,ALL_MASK); - domain->reset_box(); comm->setup(); if (neighbor->style) neighbor->setup_bins(); comm->exchange(); comm->borders(); - if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); - - atomKK->sync(Host,ALL_MASK); - + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); domain->image_check(); domain->box_too_small_check(); modify->setup_pre_neighbor(); - - atomKK->modified(Host,ALL_MASK); - neighbor->build(1); modify->setup_post_neighbor(); neighbor->ncalls = 0; @@ -247,7 +215,7 @@ void VerletKokkos::setup_minimal(int flag) } else if (force->pair) force->pair->compute_dummy(eflag,vflag); - if (atomKK->molecular != Atom::ATOMIC) { + if (atom->molecular != Atom::ATOMIC) { if (force->bond) { atomKK->sync(force->bond->execution_space,force->bond->datamask_read); force->bond->compute(eflag,vflag); diff --git a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi index 3971cc6c06..c6071cf747 100644 --- a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi +++ b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi @@ -23,7 +23,7 @@ ARCHIVE = ar ARFLAGS = -rc SHLIBFLAGS = -shared KOKKOS_DEVICES = Cuda -KOKKOS_ARCH = Kepler35 +KOKKOS_ARCH = Volta70 # --------------------------------------------------------------------- # LAMMPS-specific settings, all OPTIONAL diff --git a/src/REACTION/fix_bond_react.cpp b/src/REACTION/fix_bond_react.cpp index 192331889d..7b891d42fe 100644 --- a/src/REACTION/fix_bond_react.cpp +++ b/src/REACTION/fix_bond_react.cpp @@ -1990,7 +1990,10 @@ int FixBondReact::check_constraints() *ptr = satisfied[i] ? '1' : '0'; } double verdict = input->variable->evaluate_boolean(evalstr); - if (verdict == 0.0) return 0; + if (verdict == 0.0) { + memory->destroy(satisfied); + return 0; + } } // let's also check chirality within 'check_constraint' @@ -2012,7 +2015,10 @@ int FixBondReact::check_constraints() } } } - if (get_chirality(my4coords) != chiral_atoms[i][1][rxnID]) return 0; + if (get_chirality(my4coords) != chiral_atoms[i][1][rxnID]) { + memory->destroy(satisfied); + return 0; + } } }