From d6316c40d94efe8b0259cea2ce71bc101d5a6bcc Mon Sep 17 00:00:00 2001 From: Christoph Junghans Date: Fri, 22 Sep 2017 15:17:44 -0600 Subject: [PATCH 01/53] cmake: fix build with system latte --- cmake/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bc33da60de..666b77ae3d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -665,7 +665,9 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR}) ############################################ add_library(lammps ${LIB_SOURCES}) target_link_libraries(lammps ${LAMMPS_LINK_LIBS}) -add_dependencies(lammps ${LAMMPS_DEPS}) +if(LAMMPS_DEPS) + add_dependencies(lammps ${LAMMPS_DEPS}) +endif() set_target_properties(lammps PROPERTIES OUTPUT_NAME lammps${LAMMPS_MACHINE}) if(BUILD_SHARED_LIBS) set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION}) From 78a486c0fdfe0b8a8fa02c1e8bfac9c4bda751a7 Mon Sep 17 00:00:00 2001 From: Julien Devemy Date: Mon, 25 Sep 2017 16:18:08 +0200 Subject: [PATCH 02/53] Authorize hybrid/overlay for fix srp --- src/USER-MISC/fix_srp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/USER-MISC/fix_srp.cpp b/src/USER-MISC/fix_srp.cpp index f3dec42a83..e1e5f579b8 100644 --- a/src/USER-MISC/fix_srp.cpp +++ b/src/USER-MISC/fix_srp.cpp @@ -98,7 +98,7 @@ int FixSRP::setmask() void FixSRP::init() { - if (force->pair_match("hybrid",1) == NULL) + if (force->pair_match("hybrid",1) == NULL && force->pair_match("hybrid/overlay",1) == NULL) error->all(FLERR,"Cannot use pair srp without pair_style hybrid"); int has_rigid = 0; From 789812ec3dcef78579ee42958d3c24a3b7792b3b Mon Sep 17 00:00:00 2001 From: Tim Mattox Date: Fri, 22 Sep 2017 14:46:53 -0500 Subject: [PATCH 03/53] KOKKOS: minor typo fix --- src/KOKKOS/npair_kokkos.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index b568bd5c93..8d6648bf2b 100644 --- 
a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -164,8 +164,8 @@ void NPairKokkos::build(NeighList *list_) k_ex_mol_group.sync(); k_ex_mol_bit.sync(); k_ex_mol_intra.sync(); - k_bincount.sync(), - k_bins.sync(), + k_bincount.sync(); + k_bins.sync(); atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK); data.special_flag[0] = special_flag[0]; From 32e0de7a67a1c17d3b4f948847ca3e4b6e35e5cb Mon Sep 17 00:00:00 2001 From: Tim Mattox Date: Fri, 22 Sep 2017 15:10:53 -0500 Subject: [PATCH 04/53] first pass at implementing atom2bin for KOKKOS neighbor lists --- src/KOKKOS/nbin_kokkos.cpp | 5 +++++ src/KOKKOS/nbin_kokkos.h | 2 ++ src/KOKKOS/npair_kokkos.cpp | 7 +++++-- src/KOKKOS/npair_kokkos.h | 37 +++++-------------------------------- 4 files changed, 17 insertions(+), 34 deletions(-) diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp index c7e815928a..b06d46d520 100644 --- a/src/KOKKOS/nbin_kokkos.cpp +++ b/src/KOKKOS/nbin_kokkos.cpp @@ -75,6 +75,10 @@ void NBinKokkos::bin_atoms_setup(int nall) k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",mbins); bincount = k_bincount.view(); } + if (nall > k_atom2bin.d_view.dimension_0()) { + k_atom2bin = DAT::tdual_int_1d("Neighbor::d_atom2bin",nall); + atom2bin = k_atom2bin.view(); + } } /* ---------------------------------------------------------------------- @@ -125,6 +129,7 @@ void NBinKokkos::binatomsItem(const int &i) const { const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2)); + atom2bin(i) = ibin; const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1); if(ac < bins.dimension_1()) { bins(ibin, ac) = i; diff --git a/src/KOKKOS/nbin_kokkos.h b/src/KOKKOS/nbin_kokkos.h index de3cf41d19..bf2ccc5908 100644 --- a/src/KOKKOS/nbin_kokkos.h +++ b/src/KOKKOS/nbin_kokkos.h @@ -44,11 +44,13 @@ class NBinKokkos : public NBinStandard { int atoms_per_bin; DAT::tdual_int_1d k_bincount; DAT::tdual_int_2d k_bins; + DAT::tdual_int_1d k_atom2bin; typename 
AT::t_int_1d bincount; const typename AT::t_int_1d_const c_bincount; typename AT::t_int_2d bins; typename AT::t_int_2d_const c_bins; + typename AT::t_int_1d atom2bin; typename AT::t_int_scalar d_resize; typename ArrayTypes::t_int_scalar h_resize; typename AT::t_x_array_randomread x; diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index 8d6648bf2b..2f9e6e0b43 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -73,6 +73,7 @@ void NPairKokkos::copy_bin_info() atoms_per_bin = nbKK->atoms_per_bin; k_bincount = nbKK->k_bincount; k_bins = nbKK->k_bins; + k_atom2bin = nbKK->k_atom2bin; } /* ---------------------------------------------------------------------- @@ -122,6 +123,7 @@ void NPairKokkos::build(NeighList *list_) k_cutneighsq.view(), k_bincount.view(), k_bins.view(), + k_atom2bin.view(), nstencil, k_stencil.view(), k_stencilxyz.view(), @@ -166,6 +168,7 @@ void NPairKokkos::build(NeighList *list_) k_ex_mol_intra.sync(); k_bincount.sync(); k_bins.sync(); + k_atom2bin.sync(); atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK); data.special_flag[0] = special_flag[0]; @@ -317,7 +320,7 @@ void NeighborKokkosExecute:: const X_FLOAT ztmp = x(i, 2); const int itype = type(i); - const int ibin = coord2bin(xtmp, ytmp, ztmp); + const int ibin = c_atom2bin(i); const typename ArrayTypes::t_int_1d_const_um stencil = d_stencil; @@ -678,7 +681,7 @@ void NeighborKokkosExecute:: // no molecular test when i = ghost atom if (i < nlocal) { - const int ibin = coord2bin(xtmp, ytmp, ztmp); + const int ibin = c_atom2bin(i); for (int k = 0; k < nstencil; k++) { const int jbin = ibin + stencil[k]; for(int m = 0; m < c_bincount(jbin); m++) { diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h index 517ea546fa..6c1c0e958b 100644 --- a/src/KOKKOS/npair_kokkos.h +++ b/src/KOKKOS/npair_kokkos.h @@ -105,6 +105,7 @@ class NPairKokkos : public NPair { int atoms_per_bin; DAT::tdual_int_1d k_bincount; 
DAT::tdual_int_2d k_bins; + DAT::tdual_int_1d k_atom2bin; // data from NStencil class @@ -148,6 +149,8 @@ class NeighborKokkosExecute const typename AT::t_int_1d_const c_bincount; typename AT::t_int_2d bins; typename AT::t_int_2d_const c_bins; + const typename AT::t_int_1d atom2bin; + const typename AT::t_int_1d_const c_atom2bin; // data from NStencil class @@ -190,6 +193,7 @@ class NeighborKokkosExecute const typename AT::t_xfloat_2d_randomread &_cutneighsq, const typename AT::t_int_1d &_bincount, const typename AT::t_int_2d &_bins, + const typename AT::t_int_1d &_atom2bin, const int _nstencil, const typename AT::t_int_1d &_d_stencil, const typename AT::t_int_1d_3 &_d_stencilxyz, @@ -224,6 +228,7 @@ class NeighborKokkosExecute const int & _xprd_half, const int & _yprd_half, const int & _zprd_half): neigh_list(_neigh_list), cutneighsq(_cutneighsq), bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins), + atom2bin(_atom2bin),c_atom2bin(_atom2bin), nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz), nlocal(_nlocal), x(_x),type(_type),mask(_mask),molecule(_molecule), @@ -281,38 +286,6 @@ class NeighborKokkosExecute void build_ItemCuda(typename Kokkos::TeamPolicy::member_type dev) const; #endif - KOKKOS_INLINE_FUNCTION - int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const - { - int ix,iy,iz; - - if (x >= bboxhi[0]) - ix = static_cast ((x-bboxhi[0])*bininvx) + nbinx; - else if (x >= bboxlo[0]) { - ix = static_cast ((x-bboxlo[0])*bininvx); - ix = MIN(ix,nbinx-1); - } else - ix = static_cast ((x-bboxlo[0])*bininvx) - 1; - - if (y >= bboxhi[1]) - iy = static_cast ((y-bboxhi[1])*bininvy) + nbiny; - else if (y >= bboxlo[1]) { - iy = static_cast ((y-bboxlo[1])*bininvy); - iy = MIN(iy,nbiny-1); - } else - iy = static_cast ((y-bboxlo[1])*bininvy) - 1; - - if (z >= bboxhi[2]) - iz = static_cast ((z-bboxhi[2])*bininvz) + nbinz; - else if (z >= bboxlo[2]) { - iz = static_cast ((z-bboxlo[2])*bininvz); - iz = MIN(iz,nbinz-1); - 
} else - iz = static_cast ((z-bboxlo[2])*bininvz) - 1; - - return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo); - } - KOKKOS_INLINE_FUNCTION int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const { From 836a6d292c10a4c1d8a77b3586d2ebeb2858cf27 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 25 Sep 2017 16:31:39 -0400 Subject: [PATCH 05/53] whitespace fixes, silence compiler warning about too few format specifiers --- src/USER-MANIFOLD/manifold_gaussian_bump.cpp | 10 +++++----- src/finish.cpp | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp index db8c589afb..a9ee35bbfc 100644 --- a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp +++ b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp @@ -134,7 +134,7 @@ public: // Manifold itself: manifold_gaussian_bump::manifold_gaussian_bump(class LAMMPS* lmp, int narg, char **arg) - : manifold(lmp), lut_z(NULL), lut_zp(NULL) {} + : manifold(lmp), lut_z(NULL), lut_zp(NULL) {} manifold_gaussian_bump::~manifold_gaussian_bump() @@ -361,13 +361,13 @@ void manifold_gaussian_bump::test_lut() n( x, nn ); double taper_z; if( xx <= rc1 ){ - taper_z = gaussian_bump(xx); + taper_z = gaussian_bump(xx); }else if( xx < rc2 ){ - taper_z = lut_get_z( xx ); + taper_z = lut_get_z( xx ); }else{ - taper_z = 0.0; + taper_z = 0.0; } - fprintf( fp, "%g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z, + fprintf( fp, "%g %g %g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z, gg, nn[0], nn[1], nn[2] ); } fclose(fp); diff --git a/src/finish.cpp b/src/finish.cpp index 45e9226388..c22ecaae60 100644 --- a/src/finish.cpp +++ b/src/finish.cpp @@ -130,7 +130,7 @@ void Finish::end(int flag) atom->natoms); if (logfile) fprintf(logfile,fmt1,time_loop,ntasks,update->nsteps, atom->natoms); - + // Gromacs/NAMD-style performance metric for suitable unit settings if ( timeflag && !minflag && !prdflag && !tadflag && 
@@ -144,7 +144,7 @@ void Finish::end(int flag) double one_fs = force->femtosecond; double t_step = ((double) time_loop) / ((double) update->nsteps); double step_t = 1.0/t_step; - + if (strcmp(update->unit_style,"lj") == 0) { double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs; const char perf[] = "Performance: %.3f tau/day, %.3f timesteps/s\n"; @@ -161,7 +161,7 @@ void Finish::end(int flag) } // CPU use on MPI tasks and OpenMP threads - + if (timeflag) { if (lmp->kokkos) { const char fmt2[] = From e6969002ce55f10db0a7bebd073b2f93f947f14b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 25 Sep 2017 19:37:37 -0400 Subject: [PATCH 06/53] having plain filelink instead of filelink.o confuses KOKKOS linking with nvcc --- lib/latte/Install.py | 6 +++--- lib/latte/Makefile.lammps.gfortran | 2 +- lib/latte/Makefile.lammps.ifort | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/latte/Install.py b/lib/latte/Install.py index b3e771e4cc..37cb5d6b17 100644 --- a/lib/latte/Install.py +++ b/lib/latte/Install.py @@ -159,13 +159,13 @@ if buildflag or pathflag: os.remove("includelink") if os.path.isfile("liblink") or os.path.islink("liblink"): os.remove("liblink") - if os.path.isfile("filelink") or os.path.islink("filelink"): - os.remove("filelink") + if os.path.isfile("filelink.o") or os.path.islink("filelink.o"): + os.remove("filelink.o") cmd = 'ln -s "%s/src" includelink' % lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) cmd = 'ln -s "%s" liblink' % lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - cmd = 'ln -s "%s/src/latte_c_bind.o" filelink' % lattedir + cmd = 'ln -s "%s/src/latte_c_bind.o" filelink.o' % lattedir subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) # copy Makefile.lammps.suffix to Makefile.lammps diff --git a/lib/latte/Makefile.lammps.gfortran b/lib/latte/Makefile.lammps.gfortran index 921721552b..6aa7782f8a 100644 --- a/lib/latte/Makefile.lammps.gfortran +++ 
b/lib/latte/Makefile.lammps.gfortran @@ -3,5 +3,5 @@ # GNU Fortran settings latte_SYSINC = -latte_SYSLIB = ../../lib/latte/filelink -llatte -lgfortran -llapack -lblas +latte_SYSLIB = ../../lib/latte/filelink.o -llatte -lgfortran -llapack -lblas latte_SYSPATH = -fopenmp diff --git a/lib/latte/Makefile.lammps.ifort b/lib/latte/Makefile.lammps.ifort index 23d2b32fcc..0491bdd8a5 100644 --- a/lib/latte/Makefile.lammps.ifort +++ b/lib/latte/Makefile.lammps.ifort @@ -3,7 +3,7 @@ # Intel ifort settings latte_SYSINC = -latte_SYSLIB = ../../lib/latte/filelink \ +latte_SYSLIB = ../../lib/latte/filelink.o \ -llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \ -lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \ -openmp -O0 From 0573aaa6da3a7a439b347b9974d6f596078d8479 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 25 Sep 2017 19:37:55 -0400 Subject: [PATCH 07/53] update src/.gitignore for LATTE package --- src/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/.gitignore b/src/.gitignore index 1571065b72..13518abbe8 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -405,6 +405,8 @@ /fix_lambdah_calc.h /fix_langevin_eff.cpp /fix_langevin_eff.h +/fix_latte.cpp +/fix_latte.h /fix_lb_fluid.cpp /fix_lb_fluid.h /fix_lb_momentum.cpp From 38530415c8f0cd81d5cb57215a5b09eec877c917 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 25 Sep 2017 20:03:53 -0400 Subject: [PATCH 08/53] -ltbbmalloc is required --- src/MAKE/OPTIONS/Makefile.intel_cpu_mpich | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich index 40d517bce4..7ca59e7b1c 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich @@ -15,7 +15,7 @@ DEPFLAGS = -M LINK = mpicxx -cxx=icc LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar From b60cff7e7773573b0a6a2619e7e8e4c8ee9148b1 Mon Sep 17 00:00:00 2001 
From: Axel Kohlmeyer Date: Mon, 25 Sep 2017 23:15:49 -0400 Subject: [PATCH 09/53] USER-OMP package depends on USER-DRUDE --- src/Depend.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Depend.sh b/src/Depend.sh index 9463607960..e1c812ebc2 100644 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -119,6 +119,10 @@ if (test $1 = "USER-DPD") then depend KOKKOS fi +if (test $1 = "USER-DRUDE") then + depend USER-OMP +fi + if (test $1 = "USER-FEP") then depend USER-OMP fi From 53e4ee4f2dc9b11f8fd5e54d78ce19acb6361e0e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 25 Sep 2017 23:20:42 -0400 Subject: [PATCH 10/53] need to re-init timers after initial setup --- src/REPLICA/prd.cpp | 1 + src/REPLICA/tad.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp index 30ebc779c5..14eeac8d66 100644 --- a/src/REPLICA/prd.cpp +++ b/src/REPLICA/prd.cpp @@ -310,6 +310,7 @@ void PRD::command(int narg, char **arg) time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0; bigint clock = 0; + timer->init(); timer->barrier_start(); time_start = timer->get_wall(Timer::TOTAL); diff --git a/src/REPLICA/tad.cpp b/src/REPLICA/tad.cpp index 5a4d885224..347cd3ba67 100644 --- a/src/REPLICA/tad.cpp +++ b/src/REPLICA/tad.cpp @@ -274,6 +274,7 @@ void TAD::command(int narg, char **arg) nbuild = ndanger = 0; time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0; + timer->init(); timer->barrier_start(); time_start = timer->get_wall(Timer::TOTAL); From 8bba6d3e8c63cf66078e3671be15581c1bb94203 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 26 Sep 2017 16:52:10 -0400 Subject: [PATCH 11/53] correct formatting and broken/colliding link issues with LATTE package related documentation --- doc/src/Section_packages.txt | 1 + doc/src/fix_latte.txt | 6 +++--- doc/src/fixes.txt | 1 + doc/src/pair_eam.txt | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/src/Section_packages.txt 
b/doc/src/Section_packages.txt index d9a9fb4163..7539d99cd0 100644 --- a/doc/src/Section_packages.txt +++ b/doc/src/Section_packages.txt @@ -728,6 +728,7 @@ make lib-latte args="-b" # download and build in lib/latte/LATTE- make lib-latte args="-p $HOME/latte" # use existing LATTE installation in $HOME/latte make lib-latte args="-b -m gfortran" # download and build in lib/latte and # copy Makefile.lammps.gfortran to Makefile.lammps +:pre Note that 3 symbolic (soft) links, "includelink" and "liblink" and "filelink", are created in lib/latte to point into the LATTE home dir. diff --git a/doc/src/fix_latte.txt b/doc/src/fix_latte.txt index f78e13b866..17b3335ad7 100644 --- a/doc/src/fix_latte.txt +++ b/doc/src/fix_latte.txt @@ -66,7 +66,7 @@ reference charge of overlapping atom-centered densities and bond integrals are parameterized using a Slater-Koster tight-binding approach. This procedure, which usually is referred to as the DFTB method has been described in detail by ("Elstner"_#Elstner) and -("Finnis"_#Finnis) and coworkers. +("Finnis"_#Finnis2) and coworkers. The work of the LATTE developers follows that of Elstner closely with respect to the physical model. However, the development of LATTE is @@ -173,7 +173,7 @@ M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58, M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58, 7260 (1998). -:link(Finnis) +:link(Finnis2) [(Finnis)] M. W. Finnis, A. T. Paxton, M. Methfessel, and M. van Schilfgarde, Phys. Rev. Lett., 81, 5149 (1998). @@ -197,7 +197,7 @@ J. Sci. Comput. 36 (2), 147-170, (2014). [(Niklasson2014)] A. M. N. Niklasson and M. Cawkwell, J. Chem. Phys., 141, 164123, (2014). -:link(Niklasson2014) +:link(Niklasson2017) [(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017). 
:link(Niklasson2012) diff --git a/doc/src/fixes.txt b/doc/src/fixes.txt index 7000a66c51..e363273a75 100644 --- a/doc/src/fixes.txt +++ b/doc/src/fixes.txt @@ -59,6 +59,7 @@ Fixes :h1 fix_langevin fix_langevin_drude fix_langevin_eff + fix_latte fix_lb_fluid fix_lb_momentum fix_lb_pc diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt index a0026432ec..03e77f53ab 100644 --- a/doc/src/pair_eam.txt +++ b/doc/src/pair_eam.txt @@ -294,7 +294,7 @@ distribution have a ".cdeam" suffix. Style {eam/fs} computes pairwise interactions for metals and metal alloys using a generalized form of EAM potentials due to Finnis and -Sinclair "(Finnis)"_#Finnis. The total energy Ei of an atom I is +Sinclair "(Finnis)"_#Finnis1. The total energy Ei of an atom I is given by :c,image(Eqs/pair_eam_fs.jpg) @@ -442,7 +442,7 @@ of Physics: Condensed Matter, 16, S2629 (2004). [(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983). Daw, Baskes, Phys Rev B, 29, 6443 (1984). -:link(Finnis) +:link(Finnis1) [(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984). :link(Stukowski) From fd3ecd04812090d6fd88e2220fa25b6a3f1b3962 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 26 Sep 2017 16:52:24 -0400 Subject: [PATCH 12/53] fix typo in formatting --- doc/src/fix_neb.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/fix_neb.txt b/doc/src/fix_neb.txt index 52d8a7df84..73b3e31266 100644 --- a/doc/src/fix_neb.txt +++ b/doc/src/fix_neb.txt @@ -93,7 +93,7 @@ intermediate replica with the previous and the next image: Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre -Note that in this case the specified {Kspring) is in force/distance +Note that in this case the specified {Kspring} is in force/distance units. With a value of {ideal}, the spring force is computed as suggested in @@ -105,7 +105,7 @@ where RD is the "reaction coordinate" see "neb"_neb.html section, and RDideal is the ideal RD for which all the images are equally spaced. I.e. 
RDideal = (I-1)*meanDist when the climbing replica is off, where I is the replica number). The meanDist is the average distance -between replicas. Note that in this case the specified {Kspring) is +between replicas. Note that in this case the specified {Kspring} is in force units. Note that the {ideal} form of nudging can often be more effective at From bfdc4acb8bbe756c4911da4aa3de7f85627d9878 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 26 Sep 2017 16:53:36 -0400 Subject: [PATCH 13/53] add missing entry for pdf version of manual --- doc/src/lammps.book | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/src/lammps.book b/doc/src/lammps.book index 86dfe78af3..b74ec49aed 100644 --- a/doc/src/lammps.book +++ b/doc/src/lammps.book @@ -187,6 +187,7 @@ fix_ipi.html fix_langevin.html fix_langevin_drude.html fix_langevin_eff.html +fix_latte.html fix_lb_fluid.html fix_lb_momentum.html fix_lb_pc.html From de45fa6e7107cd9587ea14ec96b53c9a2196fb39 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 26 Sep 2017 18:25:37 -0400 Subject: [PATCH 14/53] correct bogus links in LATTE docs --- doc/src/Section_packages.txt | 2 +- doc/src/fix_latte.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt index 7539d99cd0..e08784bf6c 100644 --- a/doc/src/Section_packages.txt +++ b/doc/src/Section_packages.txt @@ -705,7 +705,7 @@ dynamics can be run with LAMMPS using density-functional tight-binding quantum forces calculated by LATTE. More information on LATTE can be found at this web site: -"https://github.com/lanl/LATTE"_#latte_home. A brief technical +"https://github.com/lanl/LATTE"_latte_home. A brief technical description is given with the "fix latte"_fix_latte.html command. 
:link(latte_home,https://github.com/lanl/LATTE) diff --git a/doc/src/fix_latte.txt b/doc/src/fix_latte.txt index 17b3335ad7..4edd610546 100644 --- a/doc/src/fix_latte.txt +++ b/doc/src/fix_latte.txt @@ -200,8 +200,8 @@ J. Sci. Comput. 36 (2), 147-170, (2014). :link(Niklasson2017) [(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017). -:link(Niklasson2012) -[(Niklasson2017)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86 +:link(Cawkwell2012) +[(Cawkwell2012)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86 (17), 174308 (2012). :link(Negre2016) From 23e283f1355179168c2bcc2783d31ec3d6d67323 Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 27 Sep 2017 16:20:07 +0200 Subject: [PATCH 15/53] Fixed proper deletion of fixes if fix is NULL --- src/modify.cpp | 10 ++++++++-- src/modify.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/modify.cpp b/src/modify.cpp index 4516788aa9..b95d8868bf 100644 --- a/src/modify.cpp +++ b/src/modify.cpp @@ -110,7 +110,7 @@ Modify::~Modify() // delete all fixes // do it via delete_fix() so callbacks in Atom are also updated correctly - while (nfix) delete_fix(fix[0]->id); + while (nfix) delete_fix(0); memory->sfree(fix); memory->destroy(fmask); @@ -944,7 +944,13 @@ void Modify::delete_fix(const char *id) { int ifix = find_fix(id); if (ifix < 0) error->all(FLERR,"Could not find fix ID to delete"); - delete fix[ifix]; + delete_fix(ifix); +} + +void Modify::delete_fix(int ifix) +{ + if(fix[ifix]) + delete fix[ifix]; atom->update_callback(ifix); // move other Fixes and fmask down in list one slot diff --git a/src/modify.h b/src/modify.h index d825d5c4ef..4ec61f6d57 100644 --- a/src/modify.h +++ b/src/modify.h @@ -95,6 +95,7 @@ class Modify : protected Pointers { void add_fix(int, char **, int trysuffix=1); void modify_fix(int, char **); void delete_fix(const char *); + void delete_fix(int); int find_fix(const char *); int find_fix_by_style(const char *); int check_package(const char 
*); From d898afaafb7ac183a8458e971b714ad2eeb79b02 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 29 Sep 2017 09:19:38 -0400 Subject: [PATCH 16/53] use <> for system includes not "" --- src/input.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/input.cpp b/src/input.cpp index 7d11b8741b..23b89d3040 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -18,7 +18,7 @@ #include #include #include -#include "sys/stat.h" +#include #include "input.h" #include "style_command.h" #include "universe.h" From d7aac2fed53cbdd13db313241c7b9a14563326a2 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 29 Sep 2017 13:26:02 -0600 Subject: [PATCH 17/53] Add sync/modify to nbin_kokkos --- src/KOKKOS/nbin_kokkos.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp index b06d46d520..95ea105ad9 100644 --- a/src/KOKKOS/nbin_kokkos.cpp +++ b/src/KOKKOS/nbin_kokkos.cpp @@ -90,6 +90,10 @@ void NBinKokkos::bin_atoms() { last_bin = update->ntimestep; + k_bins.template sync(); + k_bincount.template sync(); + k_atom2bin.template sync(); + h_resize() = 1; while(h_resize() > 0) { @@ -119,6 +123,10 @@ void NBinKokkos::bin_atoms() c_bins = bins; } } + + k_bins.template modify(); + k_bincount.template modify(); + k_atom2bin.template modify(); } /* ---------------------------------------------------------------------- */ From 9f2740b7f18f29f30e30f6bc6db7bdb8a4a8173a Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 29 Sep 2017 13:41:35 -0600 Subject: [PATCH 18/53] Partially revert 01d0a5c, avoid atomics, safe because of the while loop. 
Worst case is the resize will happen again because max wasn't accurate --- src/KOKKOS/npair_kokkos.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index 2f9e6e0b43..fd89f5ef60 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -434,7 +434,7 @@ void NeighborKokkosExecute:: if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } neigh_list.d_ilist(i) = i; @@ -644,7 +644,7 @@ void NeighborKokkosExecute::build_ItemCuda(typename Kokkos::TeamPoli if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } } } @@ -767,7 +767,7 @@ void NeighborKokkosExecute:: if(n > neigh_list.maxneighs) { resize() = 1; - if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); + if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop } neigh_list.d_ilist(i) = i; } From a86572f4fcb47b817b173cd2e4d076b2af2aa897 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 29 Sep 2017 16:20:19 -0600 Subject: [PATCH 19/53] Reduce memory churn in Kokkos package --- src/KOKKOS/comm_kokkos.cpp | 54 ++++++++++++++++++------------------- src/KOKKOS/comm_kokkos.h | 1 + src/KOKKOS/npair_kokkos.cpp | 6 +++-- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index f5ed0f525f..ba44ea813f 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -46,7 +46,8 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]); memory->sfree(sendlist); sendlist = NULL; - k_sendlist = 
ArrayTypes::tdual_int_2d(); + k_sendlist = DAT::tdual_int_2d(); + k_total_send = DAT::tdual_int_scalar("comm::k_total_send"); // error check for disallow of OpenMP threads? @@ -57,12 +58,12 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) memory->destroy(buf_recv); buf_recv = NULL; - k_exchange_sendlist = ArrayTypes:: + k_exchange_sendlist = DAT:: tdual_int_1d("comm:k_exchange_sendlist",100); - k_exchange_copylist = ArrayTypes:: + k_exchange_copylist = DAT:: tdual_int_1d("comm:k_exchange_copylist",100); - k_count = ArrayTypes::tdual_int_1d("comm:k_count",1); - k_sendflag = ArrayTypes::tdual_int_1d("comm:k_sendflag",100); + k_count = DAT::tdual_int_1d("comm:k_count",1); + k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100); memory->destroy(maxsendlist); maxsendlist = NULL; @@ -659,11 +660,11 @@ struct BuildBorderListFunctor { int iswap,maxsendlist; int nfirst,nlast,dim; typename AT::t_int_2d sendlist; - typename AT::t_int_1d nsend; + typename AT::t_int_scalar nsend; BuildBorderListFunctor(typename AT::tdual_x_array _x, typename AT::tdual_int_2d _sendlist, - typename AT::tdual_int_1d _nsend,int _nfirst, + typename AT::tdual_int_scalar _nsend,int _nfirst, int _nlast, int _dim, X_FLOAT _lo, X_FLOAT _hi, int _iswap, int _maxsendlist): @@ -684,7 +685,7 @@ struct BuildBorderListFunctor { for (int i=teamstart + dev.team_rank(); i= lo && x(i,dim) <= hi) mysend++; } - const int my_store_pos = dev.team_scan(mysend,&nsend(0)); + const int my_store_pos = dev.team_scan(mysend,&nsend()); if (my_store_pos+mysend < maxsendlist) { mysend = my_store_pos; @@ -763,37 +764,34 @@ void CommKokkos::borders_device() { if (sendflag) { if (!bordergroup || ineed >= 2) { if (style == SINGLE) { - typename ArrayTypes::tdual_int_1d total_send("TS",1); - total_send.h_view(0) = 0; - if(exec_space == Device) { - total_send.template modify(); - total_send.template sync(); - } + k_total_send.h_view() = 0; + k_total_send.template modify(); + k_total_send.template sync(); BuildBorderListFunctor 
f(atomKK->k_x,k_sendlist, - total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); - total_send.template modify(); - total_send.template sync(); + k_total_send.template modify(); + k_total_send.template sync(); - if(total_send.h_view(0) >= maxsendlist[iswap]) { - grow_list(iswap,total_send.h_view(0)); + if(k_total_send.h_view() >= maxsendlist[iswap]) { + grow_list(iswap,k_total_send.h_view()); k_sendlist.modify(); - total_send.h_view(0) = 0; + k_total_send.h_view() = 0; if(exec_space == Device) { - total_send.template modify(); - total_send.template sync(); + k_total_send.template modify(); + k_total_send.template sync(); } BuildBorderListFunctor f(atomKK->k_x,k_sendlist, - total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); - total_send.template modify(); - total_send.template sync(); + k_total_send.template modify(); + k_total_send.template sync(); } - nsend = total_send.h_view(0); + nsend = k_total_send.h_view(); } else { error->all(FLERR,"Required border comm not yet " "implemented with Kokkos"); @@ -961,7 +959,7 @@ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space) buf_send = k_buf_send.view().ptr_on_device(); } else { - k_buf_send = ArrayTypes:: + k_buf_send = DAT:: tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border); buf_send = k_buf_send.view().ptr_on_device(); } @@ -975,7 +973,7 @@ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space) { maxrecv = static_cast (BUFFACTOR * n); int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2; - k_buf_recv = ArrayTypes:: + k_buf_recv = DAT:: tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border); buf_recv = 
k_buf_recv.view().ptr_on_device(); } diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h index a8ae973124..4065efd000 100644 --- a/src/KOKKOS/comm_kokkos.h +++ b/src/KOKKOS/comm_kokkos.h @@ -53,6 +53,7 @@ class CommKokkos : public CommBrick { protected: DAT::tdual_int_2d k_sendlist; + DAT::tdual_int_scalar k_total_send; DAT::tdual_xfloat_2d k_buf_send,k_buf_recv; DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag; DAT::tdual_int_1d k_count; diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index b568bd5c93..727a81c87f 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -88,13 +88,15 @@ void NPairKokkos::copy_stencil_info() int maxstencil = ns->get_maxstencil(); - k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil); + if (maxstencil > k_stencil.dimension_0()) + k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil); for (int k = 0; k < maxstencil; k++) k_stencil.h_view(k) = ns->stencil[k]; k_stencil.modify(); k_stencil.sync(); if (GHOST) { - k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil); + if (maxstencil > k_stencilxyz.dimension_0()) + k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil); for (int k = 0; k < maxstencil; k++) { k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0]; k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1]; From 4c71beb0240905f2e147eb4971520ac8fc1912c7 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 30 Sep 2017 12:12:15 -0400 Subject: [PATCH 20/53] cleanup/simplification of compilation for fix phonon analysis tool "phana" - include the used tricubic functions directly as static functions - silence compiler warnings - define f2c.h imported data types directly or use C equivalents - since the direct LAPACK API was called and not cLAPACK, declare LAPACK interface and depend only on LAPACK - add proper dependencies - disable automatic minor version number generation. step version manually. 
- comment out optional spglib functionality by default --- tools/phonon/Makefile | 34 ++++++---- tools/phonon/README | 12 +--- tools/phonon/disp.cpp | 7 +- tools/phonon/dynmat.cpp | 30 +++++---- tools/phonon/dynmat.h | 5 -- tools/phonon/green.cpp | 1 - tools/phonon/interpolate.cpp | 124 ++++++++++++++++++++++++++++++++++- tools/phonon/interpolate.h | 7 +- tools/phonon/phonon.cpp | 14 ++-- tools/phonon/version.h | 2 +- 10 files changed, 182 insertions(+), 54 deletions(-) diff --git a/tools/phonon/Makefile b/tools/phonon/Makefile index 0aacb1e086..67f9b91fdf 100644 --- a/tools/phonon/Makefile +++ b/tools/phonon/Makefile @@ -1,7 +1,7 @@ .SUFFIXES : .o .cpp # compiler and flags -CC = g++ -Wno-unused-result -LINK = $(CC) -static +CC = g++ -Wall +LINK = $(CC) CFLAGS = -O3 $(DEBUG) $(UFLAG) # OFLAGS = -O3 $(DEBUG) @@ -9,18 +9,17 @@ INC = $(LPKINC) $(TCINC) $(SPGINC) LIB = $(LPKLIB) $(TCLIB) $(SPGLIB) # # cLapack library needed -LPKINC = -I/opt/libs/clapack/3.2.1/include -LPKLIB = -L/opt/libs/clapack/3.2.1/lib -lclapack -lblas -lf2c #-lm +LPKINC = +LPKLIB =-llapack # -# Tricubic library needed -TCINC = -I/opt/libs/tricubic/1.0/include -TCLIB = -L/opt/libs/tricubic/1.0/lib -ltricubic # # spglib 1.8.2, used to get the irreducible q-points # if UFLAG is not set, spglib won't be used. 
-UFLAG = -DUseSPG -SPGINC = -I/opt/libs/spglib/1.8.2/include -SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg + +# UFLAG = -DUseSPG +# SPGINC = -I/opt/libs/spglib/1.8.2/include +# SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg + # if spglib other than version 1.8.2 is used, please # modify file phonon.cpp, instruction can be found by searching 1.8.2 @@ -36,7 +35,7 @@ SRC = $(wildcard *.cpp) OBJ = $(SRC:.cpp=.o) #==================================================================== -all: ver ${EXE} +all: ${EXE} ${EXE}: $(OBJ) $(LINK) $(OFLAGS) $(OBJ) $(LIB) -o $@ @@ -59,3 +58,16 @@ ver: $(CC) $(CFLAGS) -c $< .cpp.o: $(CC) $(CFLAGS) $(INC) -c $< + +#==================================================================== +# dependencies +disp.o: disp.cpp phonon.h dynmat.h memory.h interpolate.h green.h timer.h \ + global.h +dynmat.o: dynmat.cpp dynmat.h memory.h interpolate.h version.h global.h +green.o: green.cpp green.h memory.h global.h +interpolate.o: interpolate.cpp interpolate.h memory.h global.h +main.o: main.cpp dynmat.h memory.h interpolate.h phonon.h +memory.o: memory.cpp memory.h +phonon.o: phonon.cpp phonon.h dynmat.h memory.h interpolate.h green.h \ + timer.h global.h +timer.o: timer.cpp timer.h diff --git a/tools/phonon/README b/tools/phonon/README index ae6383b6bd..b54d96d8a3 100644 --- a/tools/phonon/README +++ b/tools/phonon/README @@ -5,15 +5,9 @@ analyse the phonon related information. #------------------------------------------------------------------------------- 1. Dependencies - The clapack library is needed to solve the eigen problems, - which could be downloaded from: - http://www.netlib.org/clapack/ - - The tricubic library is also needed to do tricubic interpolations, - which could be obtained from: - http://orca.princeton.edu/francois/software/tricubic/ - or - http://1drv.ms/1J2WFYk + The LAPACK library is needed to solve the eigen problems. + http://www.netlib.org/lapack/ + Intel MKL can be used as well. 
The spglib is optionally needed, enabling one to evaluate the phonon density of states or vibrational thermal properties diff --git a/tools/phonon/disp.cpp b/tools/phonon/disp.cpp index 2fa603916c..218e01e7fc 100644 --- a/tools/phonon/disp.cpp +++ b/tools/phonon/disp.cpp @@ -18,7 +18,8 @@ void Phonon::pdisp() { // ask the output file name and write the header. char str[MAXLINE]; - for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n"); + for (int ii = 0; ii < 80; ++ii) printf("="); + printf("\n"); #ifdef UseSPG // ask method to generate q-lines int method = 2; @@ -53,7 +54,6 @@ void Phonon::pdisp() while (1){ for (int i = 0; i < 3; ++i) qstr[i] = qend[i]; - int quit = 0; printf("\nPlease input the start q-point in unit of B1->B3, q to exit [%g %g %g]: ", qstr[0], qstr[1], qstr[2]); int n = count_words(fgets(str, MAXLINE, stdin)); ptr = strtok(str, " \t\n\r\f"); @@ -2844,7 +2844,8 @@ void Phonon::pdisp() printf("\nPhonon dispersion data are written to: %s, you can visualize the results\n", fname); printf("by invoking: `gnuplot pdisp.gnuplot; gv pdisp.eps`\n"); } - for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n"); + for (int ii = 0; ii < 80; ++ii) printf("="); + printf("\n"); delete []fname; nodes.clear(); diff --git a/tools/phonon/dynmat.cpp b/tools/phonon/dynmat.cpp index e82f473130..3b7bfe8268 100644 --- a/tools/phonon/dynmat.cpp +++ b/tools/phonon/dynmat.cpp @@ -3,6 +3,11 @@ #include "version.h" #include "global.h" +extern "C" void zheevd_(char *, char *, long int *, doublecomplex *, + long int *, double *, doublecomplex *, + long int *, double *, long int *, long int *, + long int *, long int *); + // to initialize the class DynMat::DynMat(int narg, char **arg) { @@ -81,7 +86,8 @@ DynMat::DynMat(int narg, char **arg) printf("Number of atoms per unit cell : %d\n", nucell); printf("System dimension : %d\n", sysdim); printf("Boltzmann constant in used units : %g\n", boltz); - for (int i = 0; i < 80; ++i) printf("="); printf("\n"); + for (int i = 0; i < 
80; ++i) printf("="); + printf("\n"); if (sysdim < 1||sysdim > 3||nx < 1||ny < 1||nz < 1||nucell < 1){ printf("Wrong values read from header of file: %s, please check the binary file!\n", binfile); fclose(fp); exit(3); @@ -117,11 +123,11 @@ DynMat::DynMat(int narg, char **arg) memory->create(attyp, nucell, "DynMat:attyp"); memory->create(M_inv_sqrt, nucell, "DynMat:M_inv_sqrt"); - if ( fread(&Tmeasure, sizeof(double), 1, fp) != 1 ){printf("\nError while reading temperature from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&basevec[0], sizeof(double), 9, fp) != 9 ){printf("\nError while reading lattice info from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(basis[0], sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&attyp[0], sizeof(int), nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n", binfile); fclose(fp); exit(3);} - if ( fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&Tmeasure, sizeof(double), 1, fp) != 1 ){printf("\nError while reading temperature from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&basevec[0], sizeof(double), 9, fp) != 9 ){printf("\nError while reading lattice info from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(basis[0], sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&attyp[0], sizeof(int), nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n", binfile); fclose(fp); exit(3);} + if ( (int) fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);} fclose(fp); car2dir(); @@ -229,9 +235,9 @@ return; int 
DynMat::geteigen(double *egv, int flag) { char jobz, uplo; - integer n, lda, lwork, lrwork, *iwork, liwork, info; + long int n, lda, lwork, lrwork, *iwork, liwork, info; doublecomplex *work; - doublereal *w = &egv[0], *rwork; + double *w = &egv[0], *rwork; n = fftdim; if (flag) jobz = 'V'; @@ -338,7 +344,8 @@ void DynMat::EnforceASR() char *ptr = strtok(str," \t\n\r\f"); if (ptr) nasr = atoi(ptr); if (nasr < 1){ - for (int i=0; i<80; i++) printf("="); printf("\n"); + for (int i=0; i<80; i++) printf("="); + printf("\n"); return; } @@ -404,7 +411,8 @@ void DynMat::EnforceASR() if (i == 99){ printf("...... (%d more skiped)", fftdim-100); break;} } printf("\n"); - for (int i = 0; i < 80; ++i) printf("="); printf("\n\n"); + for (int i = 0; i < 80; ++i) printf("="); + printf("\n\n"); return; } @@ -456,7 +464,7 @@ return; * --------------------------------------------------------------------*/ void DynMat::GaussJordan(int n, double *Mat) { - int i,icol,irow,j,k,l,ll,idr,idc; + int i,icol=0,irow=0,j,k,l,ll,idr,idc; int *indxc,*indxr,*ipiv; double big, nmjk; double dum, pivinv; diff --git a/tools/phonon/dynmat.h b/tools/phonon/dynmat.h index 1d6e716584..f5bd4010b8 100644 --- a/tools/phonon/dynmat.h +++ b/tools/phonon/dynmat.h @@ -7,11 +7,6 @@ #include "memory.h" #include "interpolate.h" -extern "C"{ -#include "f2c.h" -#include "clapack.h" -} - using namespace std; class DynMat { diff --git a/tools/phonon/green.cpp b/tools/phonon/green.cpp index 8f8946dc4f..35514c03fb 100644 --- a/tools/phonon/green.cpp +++ b/tools/phonon/green.cpp @@ -224,7 +224,6 @@ void Green::recursion() { // local variables std::complex Z, rec_x, rec_x_inv; - std::complex cunit = std::complex(0.,1.); double w = wmin; diff --git a/tools/phonon/interpolate.cpp b/tools/phonon/interpolate.cpp index 8c0cbde1ce..954062d415 100644 --- a/tools/phonon/interpolate.cpp +++ b/tools/phonon/interpolate.cpp @@ -1,7 +1,125 @@ #include "interpolate.h" -#include "math.h" +#include #include "global.h" 
+/////////////////////// +// tricubic library code +static int A[64][64] = { +{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 9,-9,-9, 9, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 6,-6, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 6,-6, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 4,-4,-4, 4, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}, +{-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 9,-9, 0, 0,-9, 9, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 0, 0, 6,-6, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9, 0, 0,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0}, +{ 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 
0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0}, +{-27,27,27,-27,27,-27,-27,27,-18,-9,18, 9,18, 9,-18,-9,-18,18,-9, 9,18,-18, 9,-9,-18,18,18,-18,-9, 9, 9,-9,-12,-6,-6,-3,12, 6, 6, 3,-12,-6,12, 6,-6,-3, 6, 3,-12,12,-6, 6,-6, 6,-3, 3,-8,-4,-4,-2,-4,-2,-2,-1}, +{18,-18,-18,18,-18,18,18,-18, 9, 9,-9,-9,-9,-9, 9, 9,12,-12, 6,-6,-12,12,-6, 6,12,-12,-12,12, 6,-6,-6, 6, 6, 6, 3, 3,-6,-6,-3,-3, 6, 6,-6,-6, 3, 3,-3,-3, 8,-8, 4,-4, 4,-4, 2,-2, 4, 4, 2, 2, 2, 2, 1, 1}, +{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0}, +{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6, 9,-9, 9,-9,-9, 9,-9, 9,12,-12,-12,12, 6,-6,-6, 6, 6, 3, 6, 3,-6,-3,-6,-3, 8, 4,-8,-4, 4, 2,-4,-2, 6,-6, 6,-6, 3,-3, 3,-3, 4, 2, 4, 2, 2, 1, 2, 1}, +{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-6, 6,-6, 6, 6,-6, 6,-6,-8, 8, 8,-8,-4, 4, 4,-4,-3,-3,-3,-3, 3, 3, 3, 3,-4,-4, 4, 4,-2,-2, 2, 2,-4, 4,-4, 4,-2, 2,-2, 2,-2,-2,-2,-2,-1,-1,-1,-1}, +{ 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{-6, 6, 0, 0, 6,-6, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 
0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 4,-4, 0, 0,-4, 4, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4, 0, 0,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}, +{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0}, +{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6,12,-12, 6,-6,-12,12,-6, 6, 9,-9,-9, 9, 9,-9,-9, 9, 8, 4, 4, 2,-8,-4,-4,-2, 6, 3,-6,-3, 6, 3,-6,-3, 6,-6, 3,-3, 6,-6, 3,-3, 4, 2, 2, 1, 4, 2, 2, 1}, +{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-8, 8,-4, 4, 8,-8, 4,-4,-6, 6, 6,-6,-6, 6, 6,-6,-4,-4,-2,-2, 4, 4, 2, 2,-3,-3, 3, 3,-3,-3, 3, 3,-4, 4,-2, 2,-4, 4,-2, 2,-2,-2,-1,-1,-2,-2,-1,-1}, +{ 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +{ 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0}, +{-12,12,12,-12,12,-12,-12,12,-8,-4, 8, 4, 8, 4,-8,-4,-6, 6,-6, 6, 6,-6, 6,-6,-6, 6, 6,-6,-6, 6, 6,-6,-4,-2,-4,-2, 4, 2, 4, 2,-4,-2, 4, 2,-4,-2, 4, 2,-3, 3,-3, 3,-3, 3,-3, 3,-2,-1,-2,-1,-2,-1,-2,-1}, +{ 8,-8,-8, 8,-8, 8, 8,-8, 4, 4,-4,-4,-4,-4, 4, 4, 4,-4, 4,-4,-4, 4,-4, 4, 4,-4,-4, 4, 4,-4,-4, 4, 2, 2, 2, 2,-2,-2,-2,-2, 2, 2,-2,-2, 2, 2,-2,-2, 2,-2, 2,-2, 2,-2, 2,-2, 1, 1, 1, 1, 1, 1, 1, 1}}; + +static int ijk2n(int i, int j, int k) { + return(i+4*j+16*k); +} + +/* ---------------------------------------------------------------------------- */ + +static void tricubic_get_coeff_stacked(double a[64], double x[64]) { + int i,j; + for (i=0;i<64;i++) { + a[i]=(double)(0.0); + for (j=0;j<64;j++) { + a[i]+=A[i][j]*x[j]; + } + } +} + +static void tricubic_get_coeff(double a[64], double f[8], double dfdx[8], double dfdy[8], double dfdz[8], double d2fdxdy[8], double d2fdxdz[8], double d2fdydz[8], double d3fdxdydz[8]) { + int i; + double x[64]; + for (i=0;i<8;i++) { + x[0+i]=f[i]; + x[8+i]=dfdx[i]; + x[16+i]=dfdy[i]; + x[24+i]=dfdz[i]; + x[32+i]=d2fdxdy[i]; + x[40+i]=d2fdxdz[i]; + x[48+i]=d2fdydz[i]; + x[56+i]=d3fdxdydz[i]; + } + tricubic_get_coeff_stacked(a,x); +} + +static double tricubic_eval(double a[64], double x, double y, double z) { + int i,j,k; + double ret=(double)(0.0); + /* TRICUBIC EVAL + This is the short version of tricubic_eval. It is used to compute + the value of the function at a given point (x,y,z). To compute + partial derivatives of f, use the full version with the extra args. 
+ */ + for (i=0;i<4;i++) { + for (j=0;j<4;j++) { + for (k=0;k<4;k++) { + ret+=a[ijk2n(i,j,k)]*pow(x,i)*pow(y,j)*pow(z,k); + } + } + } + return(ret); +} + /* ---------------------------------------------------------------------------- * Constructor used to get info from caller, and prepare other necessary data * ---------------------------------------------------------------------------- */ @@ -274,7 +392,8 @@ void Interpolate::set_method() which =2-im%2; printf("Your selection: %d\n", which); - for(int i=0; i<80; i++) printf("="); printf("\n\n"); + for(int i=0; i<80; i++) printf("="); + printf("\n\n"); if (which == 1) tricubic_init(); @@ -306,4 +425,3 @@ void Interpolate::reset_gamma() return; } -/* ---------------------------------------------------------------------------- */ diff --git a/tools/phonon/interpolate.h b/tools/phonon/interpolate.h index e192fcac87..04a358ae71 100644 --- a/tools/phonon/interpolate.h +++ b/tools/phonon/interpolate.h @@ -5,11 +5,8 @@ #include "stdlib.h" #include "string.h" #include "memory.h" -#include -extern "C"{ -#include "f2c.h" -#include "clapack.h" -} + +extern "C" typedef struct { double r, i; } doublecomplex; using namespace std; diff --git a/tools/phonon/phonon.cpp b/tools/phonon/phonon.cpp index 43bea111b4..065885cf3f 100644 --- a/tools/phonon/phonon.cpp +++ b/tools/phonon/phonon.cpp @@ -42,7 +42,8 @@ Phonon::Phonon(DynMat *dm) printf("\n"); for (int i = 0; i < 37; ++i) printf("="); printf(" Menu "); - for (int i = 0; i < 37; ++i) printf("="); printf("\n"); + for (int i = 0; i < 37; ++i) printf("="); + printf("\n"); printf(" 1. Phonon DOS evaluation;\n"); printf(" 2. Phonon dispersion curves;\n"); printf(" 3. 
Dynamical matrix at arbitrary q;\n"); @@ -60,7 +61,8 @@ Phonon::Phonon(DynMat *dm) printf("Your choice [0]: "); if (count_words(fgets(str,MAXLINE,stdin)) > 0) job = atoi(strtok(str," \t\n\r\f")); printf("\nYour selection: %d\n", job); - for (int i = 0; i < 80; ++i) printf("=");printf("\n\n"); + for (int i = 0; i < 80; ++i) printf("="); + printf("\n\n"); // now to do the job according to user's choice if (job == 1) pdos(); @@ -414,7 +416,8 @@ void Phonon::vfanyq() dynmat->geteigen(egvs, 0); printf("q-point: [%lg %lg %lg], ", q[0], q[1], q[2]); printf("vibrational frequencies at this q-point:\n"); - for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]); printf("\n\n"); + for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]); + printf("\n\n"); } return; @@ -1001,7 +1004,8 @@ void Phonon::ShowCell() printf("\n"); for (int i = 0; i < 30; ++i) printf("="); printf(" Unit Cell Info "); - for (int i = 0; i < 30; ++i) printf("="); printf("\n"); + for (int i = 0; i < 30; ++i) printf("="); + printf("\n"); printf("Number of atoms in the unit cell: %d\n", dynmat->nucell); printf("Basis vectors of the unit cell:\n"); printf(" %15.8f %15.8f %15.8f\n", dynmat->basevec[0], dynmat->basevec[1], dynmat->basevec[2]); @@ -1091,7 +1095,7 @@ int Phonon::count_words(const char *line) strcpy(copy,line); char *ptr; - if (ptr = strchr(copy,'#')) *ptr = '\0'; + if ((ptr = strchr(copy,'#'))) *ptr = '\0'; if (strtok(copy," \t\n\r\f") == NULL) { memory->destroy(copy); diff --git a/tools/phonon/version.h b/tools/phonon/version.h index 8ed0e80aa7..decab631b0 100644 --- a/tools/phonon/version.h +++ b/tools/phonon/version.h @@ -1 +1 @@ -#define VERSION 7 +#define VERSION 8 From 091d0580904a43810c8c641ec91e1c82a0deb3bd Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Sat, 30 Sep 2017 17:44:15 -0400 Subject: [PATCH 21/53] Fix typo --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 666b77ae3d..48557a43f3 100644 
--- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -76,7 +76,7 @@ add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN}) option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF) if(LAMMPS_EXCEPTIONS) add_definitions(-DLAMMPS_EXCEPTIONS) - set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES -DLAMMPS_EXCEPTIONS") + set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS") endif() set(LAMMPS_MACHINE "" CACHE STRING "Suffix to append to lmp binary and liblammps (WON'T enable any features automatically") From 6e342d2e45dcfd864b011578caf97f0b5be2443c Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 1 Oct 2017 14:03:52 +0200 Subject: [PATCH 22/53] MAINT: bigint (int64) maps onto either long or long long, depending on platform. Automatically choose the correct one. --- src/USER-NETCDF/dump_netcdf.cpp | 39 +++++++++++++++++++-------- src/USER-NETCDF/dump_netcdf_mpiio.cpp | 35 +++++++++++++++++++----- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp index 971f69f7cc..3193f3c365 100644 --- a/src/USER-NETCDF/dump_netcdf.cpp +++ b/src/USER-NETCDF/dump_netcdf.cpp @@ -607,6 +607,32 @@ void DumpNetCDF::closefile() /* ---------------------------------------------------------------------- */ +template +int nc_put_var1_x(int ncid, int varid, const size_t index[], const T* tp) +{ + return nc_put_var1_double(ncid, varid, index, tp); +} + +template <> +int nc_put_var1_x(int ncid, int varid, const size_t index[], const int* tp) +{ + return nc_put_var1_int(ncid, varid, index, tp); +} + +template <> +int nc_put_var1_x(int ncid, int varid, const size_t index[], + const long* tp) +{ + return nc_put_var1_long(ncid, varid, index, tp); +} + +template <> +int nc_put_var1_x(int ncid, int varid, const size_t index[], + const long long* tp) +{ + return nc_put_var1_longlong(ncid, varid, index, tp); +} + void DumpNetCDF::write() { // open file @@ 
-638,13 +664,8 @@ void DumpNetCDF::write() th->keyword[i] ); } else if (th->vtype[i] == BIGINT) { -#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG) - NCERRX( nc_put_var1_long(ncid, thermovar[i], start, &th->bivalue), + NCERRX( nc_put_var1_x(ncid, thermovar[i], start, &th->bivalue), th->keyword[i] ); -#else - NCERRX( nc_put_var1_int(ncid, thermovar[i], start, &th->bivalue), - th->keyword[i] ); -#endif } } } @@ -930,11 +951,7 @@ void DumpNetCDF::write_prmtop() fprintf(f, "%%FLAG POINTERS\n"); fprintf(f, "%%FORMAT(10I8)\n"); -#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG) - fprintf(f, "%8li", ntotalgr); -#else - fprintf(f, "%8i", ntotalgr); -#endif + fprintf(f, BIGINT_FORMAT, ntotalgr); for (int i = 0; i < 11; i++) fprintf(f, "%8i", 0); fprintf(f, "\n"); diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp index 3b753b1b04..656da1b6df 100644 --- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp +++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp @@ -583,6 +583,34 @@ void DumpNetCDFMPIIO::closefile() /* ---------------------------------------------------------------------- */ +template +int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[], + const T* tp) +{ + return ncmpi_put_var1_double(ncid, varid, index, tp); +} + +template <> +int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[], + const int* tp) +{ + return ncmpi_put_var1_int(ncid, varid, index, tp); +} + +template <> +int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[], + const long* tp) +{ + return ncmpi_put_var1_long(ncid, varid, index, tp); +} + +template <> +int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[], + const long long* tp) +{ + return ncmpi_put_var1_longlong(ncid, varid, index, tp); +} + void DumpNetCDFMPIIO::write() { // open file @@ -616,13 +644,8 @@ void DumpNetCDFMPIIO::write() th->keyword[i] ); } else if (th->vtype[i] == BIGINT) { -#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG) - NCERRX( 
ncmpi_put_var1_long(ncid, thermovar[i], start, &th->bivalue), + NCERRX( ncmpi_put_var1_x(ncid, thermovar[i], start, &th->bivalue), th->keyword[i] ); -#else - NCERRX( ncmpi_put_var1_int(ncid, thermovar[i], start, &th->bivalue), - th->keyword[i] ); -#endif } } } From 84378f8ae27fe4c181e0f676910fea3fbd9df940 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 1 Oct 2017 14:05:12 +0200 Subject: [PATCH 23/53] MAINT: Renamed _put_var1_x to _put_var1_bigint --- src/USER-NETCDF/dump_netcdf.cpp | 14 ++++---------- src/USER-NETCDF/dump_netcdf_mpiio.cpp | 15 ++++----------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp index 3193f3c365..aa2a4700a3 100644 --- a/src/USER-NETCDF/dump_netcdf.cpp +++ b/src/USER-NETCDF/dump_netcdf.cpp @@ -608,26 +608,20 @@ void DumpNetCDF::closefile() /* ---------------------------------------------------------------------- */ template -int nc_put_var1_x(int ncid, int varid, const size_t index[], const T* tp) -{ - return nc_put_var1_double(ncid, varid, index, tp); -} - -template <> -int nc_put_var1_x(int ncid, int varid, const size_t index[], const int* tp) +int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const T* tp) { return nc_put_var1_int(ncid, varid, index, tp); } template <> -int nc_put_var1_x(int ncid, int varid, const size_t index[], +int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const long* tp) { return nc_put_var1_long(ncid, varid, index, tp); } template <> -int nc_put_var1_x(int ncid, int varid, const size_t index[], +int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const long long* tp) { return nc_put_var1_longlong(ncid, varid, index, tp); @@ -664,7 +658,7 @@ void DumpNetCDF::write() th->keyword[i] ); } else if (th->vtype[i] == BIGINT) { - NCERRX( nc_put_var1_x(ncid, thermovar[i], start, &th->bivalue), + NCERRX( nc_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue), th->keyword[i] ); } } 
diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp index 656da1b6df..e054772c41 100644 --- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp +++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp @@ -584,28 +584,21 @@ void DumpNetCDFMPIIO::closefile() /* ---------------------------------------------------------------------- */ template -int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[], +int ncmpi_put_var1_bigint(int ncid, int varid, const MPI_Offset index[], const T* tp) -{ - return ncmpi_put_var1_double(ncid, varid, index, tp); -} - -template <> -int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[], - const int* tp) { return ncmpi_put_var1_int(ncid, varid, index, tp); } template <> -int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[], +int ncmpi_put_var1_bigint(int ncid, int varid, const MPI_Offset index[], const long* tp) { return ncmpi_put_var1_long(ncid, varid, index, tp); } template <> -int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[], +int ncmpi_put_var1_bigint(int ncid, int varid, const MPI_Offset index[], const long long* tp) { return ncmpi_put_var1_longlong(ncid, varid, index, tp); @@ -644,7 +637,7 @@ void DumpNetCDFMPIIO::write() th->keyword[i] ); } else if (th->vtype[i] == BIGINT) { - NCERRX( ncmpi_put_var1_x(ncid, thermovar[i], start, &th->bivalue), + NCERRX( ncmpi_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue), th->keyword[i] ); } } From 100231bba8f19b4deb3c9e73911bc7bcaca8fbd8 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 1 Oct 2017 14:21:09 +0200 Subject: [PATCH 24/53] ENH: Enable multi file writes. 
--- src/USER-NETCDF/dump_netcdf.cpp | 35 ++++++++++++++++++++----- src/USER-NETCDF/dump_netcdf_mpiio.cpp | 37 +++++++++++++++++++++------ 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp index aa2a4700a3..85a2d40935 100644 --- a/src/USER-NETCDF/dump_netcdf.cpp +++ b/src/USER-NETCDF/dump_netcdf.cpp @@ -88,8 +88,8 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) : if (multiproc) error->all(FLERR,"Multi-processor writes are not supported."); - if (multifile) - error->all(FLERR,"Multiple files are not supported."); + if (append_flag && multifile) + error->all(FLERR,"Cannot append when writing to multiple files."); perat = new nc_perat_t[nfield]; @@ -224,6 +224,24 @@ DumpNetCDF::~DumpNetCDF() void DumpNetCDF::openfile() { + char *filecurrent = filename; + if (multifile && !singlefile_opened) { + char *filestar = filecurrent; + filecurrent = new char[strlen(filestar) + 16]; + char *ptr = strchr(filestar,'*'); + *ptr = '\0'; + if (padflag == 0) + sprintf(filecurrent,"%s" BIGINT_FORMAT "%s", + filestar,update->ntimestep,ptr+1); + else { + char bif[8],pad[16]; + strcpy(bif,BIGINT_FORMAT); + sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]); + sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1); + } + *ptr = '*'; + } + if (thermo && !singlefile_opened) { if (thermovar) delete [] thermovar; thermovar = new int[output->thermo->nfield]; @@ -268,14 +286,14 @@ void DumpNetCDF::openfile() ntotalgr = group->count(igroup); if (filewriter) { - if (append_flag && access(filename, F_OK) != -1) { + if (append_flag && !multifile && access(filecurrent, F_OK) != -1) { // Fixme! Perform checks if dimensions and variables conform with // data structure standard. 
if (singlefile_opened) return; singlefile_opened = 1; - NCERRX( nc_open(filename, NC_WRITE, &ncid), filename ); + NCERRX( nc_open(filecurrent, NC_WRITE, &ncid), filecurrent ); // dimensions NCERRX( nc_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR ); @@ -348,8 +366,8 @@ void DumpNetCDF::openfile() if (singlefile_opened) return; singlefile_opened = 1; - NCERRX( nc_create(filename, NC_64BIT_DATA, &ncid), - filename ); + NCERRX( nc_create(filecurrent, NC_64BIT_DATA, &ncid), + filecurrent ); // dimensions NCERRX( nc_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim), @@ -601,7 +619,10 @@ void DumpNetCDF::closefile() // append next time DumpNetCDF::openfile is called append_flag = 1; // write to next frame upon next open - framei++; + if (multifile) + framei = 1; + else + framei++; } } diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp index e054772c41..271f963a4e 100644 --- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp +++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp @@ -88,8 +88,8 @@ DumpNetCDFMPIIO::DumpNetCDFMPIIO(LAMMPS *lmp, int narg, char **arg) : if (multiproc) error->all(FLERR,"Multi-processor writes are not supported."); - if (multifile) - error->all(FLERR,"Multiple files are not supported."); + if (append_flag && multifile) + error->all(FLERR,"Cannot append when writing to multiple files."); perat = new nc_perat_t[nfield]; @@ -217,6 +217,24 @@ DumpNetCDFMPIIO::~DumpNetCDFMPIIO() void DumpNetCDFMPIIO::openfile() { + char *filecurrent = filename; + if (multifile && !singlefile_opened) { + char *filestar = filecurrent; + filecurrent = new char[strlen(filestar) + 16]; + char *ptr = strchr(filestar,'*'); + *ptr = '\0'; + if (padflag == 0) + sprintf(filecurrent,"%s" BIGINT_FORMAT "%s", + filestar,update->ntimestep,ptr+1); + else { + char bif[8],pad[16]; + strcpy(bif,BIGINT_FORMAT); + sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]); + sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1); + } + *ptr = '*'; + } + if (thermo 
&& !singlefile_opened) { if (thermovar) delete [] thermovar; thermovar = new int[output->thermo->nfield]; @@ -260,7 +278,7 @@ void DumpNetCDFMPIIO::openfile() // get total number of atoms ntotalgr = group->count(igroup); - if (append_flag && access(filename, F_OK) != -1) { + if (append_flag && !multifile && access(filecurrent, F_OK) != -1) { // Fixme! Perform checks if dimensions and variables conform with // data structure standard. @@ -270,8 +288,8 @@ void DumpNetCDFMPIIO::openfile() if (singlefile_opened) return; singlefile_opened = 1; - NCERRX( ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, - &ncid), filename ); + NCERRX( ncmpi_open(MPI_COMM_WORLD, filecurrent, NC_WRITE, MPI_INFO_NULL, + &ncid), filecurrent ); // dimensions NCERRX( ncmpi_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR ); @@ -344,8 +362,8 @@ void DumpNetCDFMPIIO::openfile() if (singlefile_opened) return; singlefile_opened = 1; - NCERRX( ncmpi_create(MPI_COMM_WORLD, filename, NC_64BIT_DATA, - MPI_INFO_NULL, &ncid), filename ); + NCERRX( ncmpi_create(MPI_COMM_WORLD, filecurrent, NC_64BIT_DATA, + MPI_INFO_NULL, &ncid), filecurrent ); // dimensions NCERRX( ncmpi_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim), @@ -577,7 +595,10 @@ void DumpNetCDFMPIIO::closefile() // append next time DumpNetCDFMPIIO::openfile is called append_flag = 1; // write to next frame upon next open - framei++; + if (multifile) + framei = 1; + else + framei++; } } From 56d21bfb057b18c3f0056545cc6c2c52d8175c71 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 1 Oct 2017 14:22:04 +0200 Subject: [PATCH 25/53] MAINT: Removed obsolete prmtop writer. 
--- src/USER-NETCDF/dump_netcdf.cpp | 58 --------------------------------- src/USER-NETCDF/dump_netcdf.h | 1 - 2 files changed, 59 deletions(-) diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp index 85a2d40935..be274b2052 100644 --- a/src/USER-NETCDF/dump_netcdf.cpp +++ b/src/USER-NETCDF/dump_netcdf.cpp @@ -947,64 +947,6 @@ int DumpNetCDF::modify_param(int narg, char **arg) /* ---------------------------------------------------------------------- */ -void DumpNetCDF::write_prmtop() -{ - char fn[1024]; - char tmp[81]; - FILE *f; - - strcpy(fn, filename); - strcat(fn, ".prmtop"); - - f = fopen(fn, "w"); - fprintf(f, "%%VERSION LAMMPS\n"); - fprintf(f, "%%FLAG TITLE\n"); - fprintf(f, "%%FORMAT(20a4)\n"); - memset(tmp, ' ', 76); - tmp[76] = '\0'; - fprintf(f, "NASN%s\n", tmp); - - fprintf(f, "%%FLAG POINTERS\n"); - fprintf(f, "%%FORMAT(10I8)\n"); - fprintf(f, BIGINT_FORMAT, ntotalgr); - for (int i = 0; i < 11; i++) - fprintf(f, "%8i", 0); - fprintf(f, "\n"); - for (int i = 0; i < 12; i++) - fprintf(f, "%8i", 0); - fprintf(f, "\n"); - for (int i = 0; i < 6; i++) - fprintf(f, "%8i", 0); - fprintf(f, "\n"); - - fprintf(f, "%%FLAG ATOM_NAME\n"); - fprintf(f, "%%FORMAT(20a4)\n"); - for (int i = 0; i < ntotalgr; i++) { - fprintf(f, "%4s", "He"); - if ((i+1) % 20 == 0) - fprintf(f, "\n"); - } - - fprintf(f, "%%FLAG CHARGE\n"); - fprintf(f, "%%FORMAT(5E16.5)\n"); - for (int i = 0; i < ntotalgr; i++) { - fprintf(f, "%16.5e", 0.0); - if ((i+1) % 5 == 0) - fprintf(f, "\n"); - } - - fprintf(f, "%%FLAG MASS\n"); - fprintf(f, "%%FORMAT(5E16.5)\n"); - for (int i = 0; i < ntotalgr; i++) { - fprintf(f, "%16.5e", 1.0); - if ((i+1) % 5 == 0) - fprintf(f, "\n"); - } - fclose(f); -} - -/* ---------------------------------------------------------------------- */ - void DumpNetCDF::ncerr(int err, const char *descr, int line) { if (err != NC_NOERR) { diff --git a/src/USER-NETCDF/dump_netcdf.h b/src/USER-NETCDF/dump_netcdf.h index b86f294d30..25d64efade 100644 --- 
a/src/USER-NETCDF/dump_netcdf.h +++ b/src/USER-NETCDF/dump_netcdf.h @@ -92,7 +92,6 @@ class DumpNetCDF : public DumpCustom { void closefile(); virtual void write_header(bigint); virtual void write_data(int, double *); - void write_prmtop(); virtual int modify_param(int, char **); From da7be99cc4a31636b08411e8564860f1ef254f63 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 1 Oct 2017 14:28:20 +0200 Subject: [PATCH 26/53] DOC: Added multi file example. --- doc/src/dump_netcdf.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/dump_netcdf.txt b/doc/src/dump_netcdf.txt index 63568137a6..70111a36a8 100644 --- a/doc/src/dump_netcdf.txt +++ b/doc/src/dump_netcdf.txt @@ -25,7 +25,8 @@ args = list of atom attributes, same as for "dump_style custom"_dump.html :l,ule dump 1 all netcdf 100 traj.nc type x y z vx vy vz dump_modify 1 append yes at -1 thermo yes -dump 1 all netcdf/mpiio 1000 traj.nc id type x y z :pre +dump 1 all netcdf/mpiio 1000 traj.nc id type x y z +dump 1 all netcdf 1000 traj.*.nc id type x y z :pre [Description:] @@ -73,4 +74,3 @@ section for more info. [Related commands:] "dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html - From fbe42cda2d2458e60ba3bc6906d4c6f62cac74d6 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 1 Oct 2017 14:31:33 +0200 Subject: [PATCH 27/53] MAINT: Only set append flag when not in multifile mode. 
--- src/USER-NETCDF/dump_netcdf.cpp | 7 ++++--- src/USER-NETCDF/dump_netcdf_mpiio.cpp | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp index be274b2052..7156b773b3 100644 --- a/src/USER-NETCDF/dump_netcdf.cpp +++ b/src/USER-NETCDF/dump_netcdf.cpp @@ -616,13 +616,14 @@ void DumpNetCDF::closefile() if (filewriter && singlefile_opened) { NCERR( nc_close(ncid) ); singlefile_opened = 0; - // append next time DumpNetCDF::openfile is called - append_flag = 1; // write to next frame upon next open if (multifile) framei = 1; - else + else { + // append next time DumpNetCDF::openfile is called + append_flag = 1; framei++; + } } } diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp index 271f963a4e..29c2b6cb1f 100644 --- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp +++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp @@ -592,13 +592,14 @@ void DumpNetCDFMPIIO::closefile() if (singlefile_opened) { NCERR( ncmpi_close(ncid) ); singlefile_opened = 0; - // append next time DumpNetCDFMPIIO::openfile is called - append_flag = 1; // write to next frame upon next open if (multifile) framei = 1; - else + else { + // append next time DumpNetCDFMPIIO::openfile is called + append_flag = 1; framei++; + } } } From a7b0d1f521afac4f6b48dfbe6ba79c6054965f6c Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 1 Oct 2017 14:40:19 +0200 Subject: [PATCH 28/53] DOC: Corrected syntax for appending at certain frame (NetCDF only) --- doc/src/dump_modify.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/dump_modify.txt b/doc/src/dump_modify.txt index 2ea1da3db3..6ccf40a8c5 100644 --- a/doc/src/dump_modify.txt +++ b/doc/src/dump_modify.txt @@ -16,7 +16,7 @@ dump-ID = ID of dump to modify :ulb,l one or more keyword/value pairs may be appended :l these keywords apply to various dump styles :l keyword = {append} or {buffer} or {element} or {every} or {fileper} 
or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l - {append} arg = {yes} or {no} or {at} N + {append} arg = {yes} or {no} or {yes at} N N = index of frame written upon first dump {buffer} arg = {yes} or {no} {element} args = E1 E2 ... EN, where N = # of atom types From cf24dd026520a5283589fd89150e49da3e4f0bc2 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 2 Oct 2017 04:53:17 -0700 Subject: [PATCH 29/53] Adding pair style dpd/intel and dihedral style fourier/intel Adding raw performance numbers for Skylake xeon server. Fixes for using older Intel compilers and compiling without OpenMP. Fix adding in hooks for using USER-INTEL w/ minimization. --- doc/src/JPG/user_intel.png | Bin 20491 -> 19528 bytes doc/src/accelerate_intel.txt | 8 +- doc/src/dihedral_fourier.txt | 1 + doc/src/pair_dpd.txt | 1 + src/USER-INTEL/README | 16 +- src/USER-INTEL/TEST/README | 20 +- src/USER-INTEL/TEST/in.intel.dpd | 48 ++ src/USER-INTEL/dihedral_fourier_intel.cpp | 441 +++++++++++++ src/USER-INTEL/dihedral_fourier_intel.h | 82 +++ src/USER-INTEL/fix_intel.cpp | 1 + src/USER-INTEL/fix_intel.h | 3 + src/USER-INTEL/intel_buffers.cpp | 3 +- src/USER-INTEL/intel_preprocess.h | 9 + src/USER-INTEL/npair_full_bin_ghost_intel.cpp | 2 - src/USER-INTEL/npair_intel.cpp | 2 - src/USER-INTEL/pair_dpd_intel.cpp | 617 ++++++++++++++++++ src/USER-INTEL/pair_dpd_intel.h | 110 ++++ 17 files changed, 1345 insertions(+), 19 deletions(-) create mode 100644 src/USER-INTEL/TEST/in.intel.dpd create mode 100644 src/USER-INTEL/dihedral_fourier_intel.cpp create mode 100644 src/USER-INTEL/dihedral_fourier_intel.h create mode 100644 src/USER-INTEL/pair_dpd_intel.cpp create mode 100644 src/USER-INTEL/pair_dpd_intel.h diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png index 7ec83b3207b06c4bbda7d56f2a7d9d94a15d115d..5061f1af2e26d9c2c1110390143d9ebf96946bd4 100755 GIT binary patch literal 19528 
zcmeHu1z1#V*X|$)f(jz3)Jvy?;2^D(fOLpRNeC+49ZJX0NP~1J1Bi5YNJtMYT?0rr zbM}Dxs^9z7^`HNo|D5Z)E^%Pb-s@TGUTfWJt!MA;ColWpD%MRb5D0WtQbJS_1iFw0 z0-*$5x&R#MI`ZZOK2WR_9|(h>U8GCEA83a6W$uGO1;Lk(buI#bW135-S%E;;t&u+{ z69yFaAkf2cNzwbt_FAh60v~31!_tiyt2`78uciloegEKj*`p9)wwYMSXWiUChAid? z=LOO`dM(K~v3n(XNo7I$phlc-L(`Zm*Z#o4A^qxm6n;?RcCw<4AOaQxg$sL~nr#mu zP=ViT)8vyM6kq_q8K5_j10T_(_)fsbQ%iYx#njPCu+ZuL;k;^4deg?tf|==cxZ#{b zAue!T#r=Zy3#zF#)k|^;@S2)Z4aDgI_4~1YJ0rfXI0ztN0^8r^!G6=JJ5~vF^w6bF z5!NXV9L~dNs-~9puR02Wj;Qn86x;+PQJlS8Ms=vwYLG@)=@Q)QPYiJ6=m`n6; z9Zg?#C$_4jWhX%afhtLU*arON-gf`Ya5{|N4gWv&<>B-?Vt)a#uZ^H5@Aq*+4MFi2 z0D-nI4s6z(jJh>?9GaVfZr=lxbgu^fZ$hg3Kk>J9@AW(&OlDT%w0YjViKGy6vziHV z01Kjl*e@{bzP-z^tNG>S_7L|8BjBo$<(>5;A?sCece4)&)C_7O$45p)rLNnP1EP+? z4N8#v{c~g2FR8sBr?*llTS|XFHb1<74OGkU7UnVi7G@{&ogxWpL)(E{5HzYmB?gMw{O*_!2I_fDFV|-J1Tb+{d402)2x!FM*z3lZW5807Bfo zz49q@Z;Fi*-j~>Y*j=aIOCVw_d?)$%VgHl$bu;JKi=Zo4{RJ4h_?ujhq$^zRlV~@f zg90&;hq#(fkBqX{j-%e~?%xK3o=PBPW-#s^t0e{q2o9ryye|UQ8OG|$`BKO>;G}#% zkfDnM0&%j779fsKDOz(|~q*zDxr(t(3;jfqqDB zr0cG!F>qTc!ZO9dc^g|*ZPe?F_Ou>TNA4Ebi7$D*V${b|`l~4W#2Vj1S zkbjM`gR&E-HtWxm6$-R#OoNjeFhEzjen;=;ybi!B{_KaI4uZpOZmZhmnIgyYWT2HJjhV)_W z*FTkzDAT-Khw1{Q4QYSKChcUYzcfvz8f%B1BW=Sm5LVxsO3^o9tX znAYh82Aq?7_JzH^(ZLTlNGrhiu((xRQZq*qTM#H77j0|>zyL>k{AnOs(`lA@%Am}> zkFY+;o?1o<2b}T;n&eQ~C@!SvlOB1l?`O?*|8ewqDvw$*m=4ESg8uqCPX^= zM3fGgDC{&XiBt|>s#W?~5^53oXkCmsh_ne4J+oioh5(!u?@eA)(W{RfX4)WA(9q z@T3yz7pq@HjR1V6tt66-@fPiuwXJ~igpdBSAUu4h0~F0L6}X$5+YGe;DE(*co8KqF z`;6Cd->`i_G`rnIy@&GiDjco_B{ffzj+Y!yUmUOcY;N@JgYZE=3+E(oI@sV>t1ROr za^k@UnG+f3i4oT~pJ#bxF$`}r?u23Iw~NeJfpo0@8CE@iU9)f%_?djPU*x-dsA=A% zLzyhZ_zh%}f)=?MRC)5>7Thv)IOY-rU99`bMa}q46LbfJ_ku(LxIn+awMy>gUAM5>=r5 zcX=E&WHm)x9a4~Z~I z!jF&KPV8LB%;--1iIR>bi``pBA}vl%5wRQlhNqZkQComBEqA| zW1+ECiWM@3T@e1;zd8rgzlwQ;aG@#AEn|Ig>Zbe(%6{}WeK`DbRt&by^n!%_si?H~ z+sHxk0e|(C0y0e(zblzlZYqud&!5>^tQ)*uh59+<-@590Q?*ee4h4EJZ5J6r+^miZ?jzo(<#`D2cDXAC`c%X&Kftl(rHWyx-1COm%Lnajd}_& zYsFiH3Wh&>qv+o6XI}vt8yQ|`;Hb}NkI+<~RlLw^x3GZ9VHn0q99m(ED;8c|h9}Ij$ZcjiNO 
zYSY^4(}B)cBhn3vZYh3eh%jw*EKW9)df|(Sb}7TNYuB|)fpO>Z-mYq6J@|ZodolrfAWVoWVRlJYNP&gv!#FJY#ZL3lnAWRw0NhQF~Ri#{yC0V|z8 za2`DW`B9o>$9GPq@fWba@C$Z{60Gx#|55b+T|_7OYTSTLk6@wd>VvtuJ))w#)cesvD8P@rrA~sAglg(_cmX7!KPp@uCz4{;_^$b^y9TP@q6{+ ziCd9aJ!e+rG59k-tF=q}SK8OilSak*M*4mXM)HvzDlJv&AvMeP_b#7vA&|X)M*%9T zFk7>U7A1K@{3BM}^a|SoH}`+z@)Tb{f30M!b!z64oRt+-I50Q$-27M51NEBg2ko4b zN9}uhg#|eI%l}$Vg#lRzbLvSW$T^q&^y3u&I$h4u&4Fn6repyHAqUG!-mwE9Tob8(gmDho zlF~l~DC{{*9ru?wQWGVL8UH#j&XNAVYQRX=-_{ltlUW(AzlSK~Zws9KfcS)H>fb@= z-&sJ#L?2h7)F`4D4AjP~mwLFC$X@t+$@_b1{THqR+&?ms)flZX&`dRp9GJfM5`a(Q zp5p_3j)IB~wp8$q9nlXm$kGWMr%GWEF!)%c;Kz}g0}*RQjz7p!`+EMJ?CJ#_{<39L zr87Jj`s#m+$TLO%a>@H!Dw(#g*b_BuO0r*4o24j3ywFiriT3E#6Bh_IRzjBd9PkHU z20#-N6J_0Bv3Yxtqf~K^SPoh~s$3Mx-NVrS@%PYEAyE^>2XX@dmcu}~sNmAsaD*g< z&&~tcntue+tB#wT+x+Yau?oTb+wEu|??)q$El3W8g~}h$M4cp(m@{YOvFF(wfoxfx zS1 zxE$~%-A%C=LU9!@9spc`o51Nnn*?NPfN#onbkCqTZ8HD>A?4ZFjqF2X^LUFcdv46; zRLH21IN(82q%Z%L&Hxl*q6&^3&y7;A>xrw71cV?b@-uSnR81JUSZ{ARFrJYEsq_<9 z$nfep>(10f0i>@0W7WUl+jC-Op#m}+9T@t5jUi|MCr=Fk1A#L7i(AgiTYv;WA1EZ@ z48Qs!DTOfl2B2Fn-mi!PeJFqk6_TTBn`wT7F*EC;fjAI7+ek1t0eb<^knxkq{ugU; zU;u&i-FtEhOGZG$e;o*mbNm5UpOuggzDW86=DjoA0LrWX9_0J@k<|uI2WbDlSjMHr z+3loY^Uqt11-F@8I%{(t0ucv31;K-39QGzfO$OBYh$Kx#XumN(i(~9<_b0v{O zVS(VkuLs2*r6#_G&&wy1SEC`vDRlq71hUlrDAwH?*OL@d_9-s{F~AC7cVztu3ZVb7 zN{cEB1fT#Q*LeR4EL2o4e%wzb`sMEFm>1C_WB}^s0uj2YN~T@F1#lWb9!r5K zK-=;cSqKn<&NphV*3Nq3leBc>fc2>mm_*3qT%59|&3kJ9v!v+~UEDkvYuzEX+$rzSXE^RNQ@OKEfFFNXnS z*`cGSj%@sX&f2rxztUTZuXS;B&AY^bL~s72N_NJj96k_}-}&Thw{p5z0=UEITgO9! 
zbUmB4%^B(ci-1%*4-}kLr+>}NjZKx7voiW0<$-5gWIeNGXK&LaGUyzI3-u$_NKD(1KM#u;} zv_+URst(*n-132r1JB5W@cmmg$t6#I>hy44FE5ZS+Hgk1^xvyc3Dbfx0qrtu&*qR% z|A_9dIL>zqA!bv9mb7W`2~ohvf3JRQE|L4GngS2(NO+#&J@v)+f+TtPv0eb~ zD(Zt30k*ZCV(>^)-c5V7bE@nt{VNm{72S<@5pJfx;wbveuz zjE_mBt#O~~;nEd`I;Oz(uw1~CFu{TfIzez(B3A~mk^K89HDti9RBdy2ohT8Ym(FdH zY1#4+*<~+Qtg(hVW0*pq$S}!AuCcg<&o|3lkQCD1TV-8ySOlFc@ z!aer%8#YTqD0J#Jx%v3zCYJB^9o5IQlPG*gswgO;rpTghj*a?X6=XbIJvU7Y-J-s& ze^jaSQ^(x;>W(XBg4KfbuP7Q~?NpZ{6$;^7bCt&Em{42$L~_)G4LCZwOPtw5f7wr! zNen{s$>7)ZL$F@pjTN`M&<~KzoE{$6GVM-kPpDj4zQt`|Py<5&eAYMooHoR0bn^iz zh1y(Db@gk~PlmtB_)0*YRI}&OzZ&BS6CP^VLVX9@Qid`pZ0+Nfn5DV1pQ&E*me`Sn@q^Np@k^P%Dw7fC79{mLG+IHcC4rtZi{bPzz3U zS9OuFBLm)R`{Wdt3hdZkpJyr5B*HvC+6Z0Mk@SkZE%(Sv->Wr$qk##Ja;2R@_G1hb zx{;qaKh=lvJ`rMRR@!9abz30nS#4fg#Q6A63vrYhwoJMl#qJ^EE_1N-rRTt42TZSY{Bdx@zCN@4T)C{s> zvBA&ZS#n={(nVs@xhA}Dfen5+SAq$@yXxuwm$*O1EK0*iK^81d*|?N9z&D3r5xtDn z+1Fl93h~l3KfrStTdf`Eb{RF24uPPI2AUXaVIjt*Mi$<0-PsgF5?BeIBq%iaHo=KO z!DZuwphrTT!TR$=4w$ac&zi1=c+=yHqEkIB6`uQ|B|9NACq@VH{#;2tkLHR@*#evK zI!`??wrhIBu?Fy-2xhVB6BpcAbXd~L)qGYs+4M()_{>cfqndQRb%Sr}td0j+2yU4y zZA(27kH`GLac^!yP`#@rp|kW)uXZ_HfM1Rqwp2>XPD&nqm#Sinr)9Zyd3lkxva?Ls z;sAABoE|G!aGusT*N1dn`_qDQnggnO!GoIzRA&s! 
zBLj+bZFv})!3t@Z(PO*zVvUx&CZwU8M^52PhlyK3dOargowv@L=J(es@Cuk zVHN9>+&dGC*uu2^?PM~C+DnpNhnQC>x}+bMmnXTvy2{Z7yAo!+u8=^C@>wjD>{`di z@bey|a0t>g8{(}_dTN!;q=%|plML@gx@7>uqeonOK^UvQ)0UnAc_vzI zkfQ3}`CgZogpF8?c6(-zvg5pRmh`z@kS<>CdZ7V7!TG~l```egC2reun@XFrhq=k% z5r-G#!%J;;tftc|XZdp|=ic^6lergbPIif@1`)Xiqx09D#8ra%>L!VO%dv0g+{P!T zbs6S@tW)H*tnA%r!iE>m1iBrxXc{cK?fTR@GQP%tF^9>`;y3+lDAD>6IqQw$v`6N< z^F~*8@YvaoJM%t0Tf?vhvk%I|*KVGrU2O)J#62wH&_aMXD@AXdui zXI=`<=Y}^vpglDk;V0=vo1{yY3KQztWRBJMKLj83>O5PYe;;t-PF|uQBE9BUKb>)q z-!2BTaDHi=0a5Wgf_c=_DAH7{-6el09iS22vN4>sBX;HzC`=*>Q%QM(qibTT-cqaW zyv77Yf^;#B1D_kJ*YrbJDn^zjUeNUkmp-|Y<*cDlL1&haV zK3MI~<(hM!@IKY)!*zY_kBCT9fR|N2z~B-H zem1#4$f$(J?3`(vjF3U)1ILs(d)|ya+TPcF52Lo-J{xoKv_*s)p!RUOeHQ)}zwgo5 zAMFg%gm2sxM3*0JA5sk|wq7fR3pQ5VsJaw(LL)XeFm|Vr&74nXYQ3L)q(ht$27mEl zwC(W&#jlneE~pl&YfJ4-a;E97V^gJAxGJ7fyS za7uQuEqeGlRi-k`7Ea%-qp0N$Z=5^ z6mugJKKLvdMh(C03DlQcUYkpvMxIm2h4YKhKnwa3l;|Jt>?X*hO38ePf8OXOQX_U!xc1Nz`8|H$G7Z zmu!8P-d2L_#B0cyQI1ur^6#{1@CO~hLu*AtideIQ-<*)2c2+)o4`W=`?5y!-?2T$D z*;pf7&BjLUU6HtC>+|{~MT>6*Pb5f$kvc=F7a_%s@89wWXU)N;5=XJ?+UVDYhb?Qd z#)3=JxP?9B7Pqw@a28!k2yGdQ9Uvf3!|M!gciXr)-ieL+THUE+r)%W2h4?m!Oxg>r z3;89}wHCx?+Zw@E0yLW@X~@pIMGB2bGhBFEH}q>(&#CSO!#g9XoA^jyZWXqo%wx<@B63=QRN`cv^J zy9?CM^;lCwO;idB!lRbK9Wn)y@o2qkRhBK=QF-lmoLz1x4ypT}PA@IVx8q=}&&qtP zP=!+{vzx9?+7f(Xdo*3n-`pXNH`TAZ>gTk{S zq4l(8y&C6Qi%CruvHTWC{f-cJQ-ipQq-m~ox$4A#faxi5dG=>^lM4GU$qRvZne8+U zYBDOT7v}s57)j<#hBgv&ywZa=@cCb>zrps=m&GJcfikqkKbJ$BY<2qbbx@I*G9pjL zLX(`HY6|`mwWd;8>=>rSy=b$FPcejyOfoPCo-^@KEcl9zLl?~ zF{tXCasg{a&!2_6PZet2yXOzt7@Z3!Grsp{jiywz7AJYJGCb+Cs%=d(Hbhf`*KatQ zG#mBYbbu2WkG+c&=HouSyJ|YN`RU}ATO2~7lX^yDAn@d61>7m@UHG$KrraTdapgYEPgyeksWY!lKX6QyF}Wt$Sk7_DJut}RedYZwpe%dgv^#tu+gcll}#3} zziXiF!x?{ko1y(zuujH%?%!%GuyS?FZTXZu#6uc~4ir=!6MP=(%QdW<_Y1hMcRmzy zoyttJpG$&8|Jo*$LPNlHU-7z&E6T|>_aQ8MR~5^265(|gz_>{~C?0BdI^|eNaIm^g zz}L5X-^j;g%vHf9P!~t*9s-knILD~zsu%PvTgBT^96NhG<)iEx?KSE=cVGZ71-HMq zSSq5LP;zuL^PL?V9gnnhySGlGO8;xmv&O(bj1fLY;sab$3!dmKk-J;5pg$~eHrah! 
z%l{eH6-XeIf8flKXF4CnX*#s%Wr%9-wEbZKgsF z-7HFQ#7>@Dj}7n|2k7e+x6=dqp)dITWzq#~mbjliPZvym;je#}(uWwk8mjr*#tYEF zB~?&ZQbv?DNAl^-mR>^=t5cqb3zDio(5ApA@nWpkXLIy zJM3Np9wYo^CAD-IVgb|H@jk)~oxCUfata2*ZUp-!0b)CuxxM9CbtmHEK5u4;`rC?2 zV``bxwZ~C6_U>2Tv&c1p^?bD6S+}*^@lf$!wx&msMdK8X{OBJC?XK^{*VKmvF_73% zi6}-8E!`-yZEH=pGGz$t8u5$P+@5oQhWMuo(3r|@+v?4D3r}xc>mDIaSo9JL#!n)@ zo?a=@P!K$SI4_DJPDYBix)OXvGL$XMF-RNvAb;7h;YfgXjhkzmh3SVO(|$^eCTZOg z>iVO*S9|=%CIv`eV%lB4Bdx{AFLP_dS&ilB#&&rksQAm(salIudmkDKs&Ep<@)13C zOIy#FHDP-2_Mmx7|ibSaoKd;cEeW@!E z)=MSRvL>l#D*f?UzjyzKT|A%NkSo9#+CASiAuxaHhRccr*U*_w%Qb#u+~7xznWaKB zIWWB=XL=htZw}>Dx5_ek5jfU$byzNMe=lfSK$*R@CZxQ_LNW20$o|NbjgTaW(!QbQ zm7=8QElE@6D->^hN1o%fCBJWFStUBqPbrcK@+79_R>YTW!Xm$3BKm>%VMzX9*ls># zP?R$xPE=pQQ;G_63iiOtMLqdN1e=&pT}+82QbBV6lSdNcQ;xHSrS?T}(>sr$Q(v#> z4~F|h4i0F{2=3q2@_uQuF}k|oyYsPPtc$Xtc87oa8h>l=AAES$FNv&3E8b;M^Ja57 zhk&ZRFAjX194h)`SXOW1HPygN1x~!$_BLO_P$L9XFE*XiMS_)DNJx|l=}b0eDlhIO zzrJV8ukI2gqF~?a2*ppQk5!nq@)pga>9vCs-`9wA;3HX8%NR^m^S6eH+(z!fRP5{5 z2+X@yCTM&$kdFAA`T*bXL)NGHfr#PlmoK=Bnufh&Kex?MRiFR!RfwDX5v zyh1`>g%=@`!^-ccC@s{|Lhf^ae5Say8O3WNr0Oq)XW~y!B|@cq-C-HFQ&ye(wS$3v z>dTUek0Z7s9F@M39CtCSUrwXGGWNEU)3cWWg%iQrXz__QIkPvk({0Rp#C=W=cECPj zP!FHVfhx4muel?t-d%UCdu*a$n#nNE_*EvkLjOtuBg<`O6!H%il&*a1a5YmJ#yI~w z{4}fqX>t4ZC0F7{#RQFKnzQNOykZW9E$U2iEW@6KmJlS86s01sMmG5v47;b-pB|GN zl2YeVzcLzw+L6YMOv{&8XZ`dU1^xjD1)$B0Gem|c7vFGY z^ooAciV>m0CyJ4`(41ZMdm|-tb~UnkyAU2dGx6W6J?;f|lD-m~_UlYdncLs-;p;Z` zFg8(WRJp`0T>Mz|+DpP>!IXqJ{q7T~QxQ`Z&08MzEv50G;-`Dj&d)X)N2{cl%ElxV z+-^Ri27A#zc~42iz!G+impjI?5fvOOyNY#-!`5_GQIO!Flf+o`B3XG!ghlK4myb@c z=0;H`EN;kr$Yp%O7kqSH+Jp{zDYsHtyo3Yp%w7Qc0>emM9P|`K83|Rz)0qXJK7vRB z*kd65cul9P+d_6C`b59&bOXD*|1I=w7^90=NOJ4_Rx^|x%Fuy#aQF59>lNT^BfWe4 z+VDjZAq+fCG~g|zHsiC_D)Vpcw(8eKsGmN#dxYRHKS1D?ij|Ez$kjP7oCNbVEed5G zsY?8^g%7}oSV!zk8hl;4_^xYJRF2h`d{Wy<`z3)_n3kWTv>2B~C%>#&$;?^>ZcVzq zKUoDV06{8=h?3MSt#{BWu`+U-lIydYR&u9E+D9C8Lz#j3gZ#6>3zO z%06Q|A3qWIoC+x0z#CJ*3-w!ZGN*^Yy_$zXR^-s}ERkvdm`2kD^UNT=P%Mn?-gv}GL*npoOE)CMK|7-pZ6ku=Ar58_8#4y=qNp=`JxcJ 
znU&=_e>>B5`X#sGYL1F-MiXkC!}_&(0>OffStP1SXQ;xHVP*r}WY7oIs?!5& zzC(It-Xd2<)a1wopQu_vCc@N%qmrMf!);9$E$CmlxY(7F_7DUUG7e7hp1>v%S>Y|z zEXG9mtDguUp(GSE?iJSNJu|M2wu8nU>G{GuktwKTFIJ%PBjW1a^i*uj*4%tD{Irfl z#mQZU7ULpzQDioJ>Qim-1uMOjgF(YWTmD9i+Z;6Q0$#7s#2r3f4!F{mYz&s-5nZPI zP^vTI?ZUT*Ru>-&IY82qrt^o%L<^w_vM)sZK9`+x$OKY8i|r|RI^3D#s(LaaW5O}k zR5s+OuIpf;?dIfvQbb)ETDs_{<}k9dI_WC#B+>3<#Jk{D_sx%_OwOsT1);OM3Xk;R2y$;;b zOdLtT`o?omGwiWOTJ{2GXcU1so*XS+!g$jxkC}OQ!&SyQ;1Sj2M8<$mR7-S%iC5$X z>39Tb#4eMtVOj$!QOI>S4BI>MU7wqAjDe^kt20n(UdF2aUTI3}beoG6c6Y{;`8L&G z@4bgY;zYY6In~Bw-N?q5ym}7AFTjYfIF9D91+y~0N`JZIa#=;pyUNtJ63YI@UMt{k zmoq~+2|2kZx1d6)mE1Bt6+{gvHc`o~#9`APgdKwAU%$|E20FGv1*+|_ux5boJ4Pk9 zY&)81&gnb}wKFR6jx*dqsj_=23fpwVyTv>e8xiDpDR16!sCiBK&Y{=@qhOqy9$Z8^y*`0a!{>MfGyBql!NS8&fOK2GL0dXelJMf;{?qqg5l zWHuqvlqOMr5&MQ|0R>ak3YORF8LeTgyllNSSm}MsVkdH{IZjVzZUF~>Pv0lfZo*rLe2?fp+=3Q1wXvjhB-t~K93n6a#@w>7bM8#H` zQ+_;r8PTa5IrknogkCnv{M;74#<3=C?K~J*sD@)BexS^?(?Y@2bTRZ3q`pku@6aJ0 z-+SKw#5kp);;|JIx&D-;;2XlW#hnj0ZFkn+nu8O0=tlC4D*p&`#MVMiGY6fvE-pZk zqQxk4(5_JMArTa}Ad2cMBJF!6REIL^6u!=e!FYNfrWX{jYW6&ce)A&S7bDUobxgwj zz(b&NQ)zQ1tV0CNhB677!JOpF@x9s4 zmxG0rW2Hmiu^%dp-5Yo0(+brctXt`^DTp%pXjTZi6AVy=Tf<&yB1#-(Is9t zMNF2EDTwux~#%j z8DPq35PbcmU70fc1+}?X;vs!r4K?dC8}mJ(Sg;Bvl-}Jf4o|iSCPz3^UcWDJh2zv+ zdVyamfF{c$9Cw7Qc-Gt~rQ5KRsK7=>P&x2Sb3tYFC{f2l=K^7$hM^Q$-oOkz=ty+A z%~*yy-5VlFd7>eS1z`c_HLUbPiwFGvLiNr$cad7-G`TVdu9*v5UpRr*y{L9?*nA=2 zuzy1f@?jcw}n#!`Fu&d}%yUG$%ym`ozi8-`amfkT11f=yjJ<2ZBd5CTI$nkl<_vUk@ zIQy)-b4oq^W?%y=$+UPzJ9TZZ5`MYcU%g?Uz;t1VkbBPwxWI-{m<+di?p8+euvxl=9?Ng%7Sx5%A0#ifh+dr7R~+v_;k$l ze)+UMBv{mibZESb>BSwEg0dSxEXP3(G_Ermw&NM`%#(k)H@f7#@N5p^7c_Zz&*^>Q z(giOuEXv-kNGGNM!U= z)m7}PSD~wGc8&x$zjQImjKxkD#_uMoO&2*SO0VcTze#y@^#zAvH-2iR1ck$4P&F_- zia(=?8z4+KXNo;c#ffa74OLY1*5f-=AWWe~qsY{4ysnK}RSu_6dYq;-UGUDj*l;+5 zF&@HPt*5G3Pltr93T7lk*tm45MPlW7Ls6?!Zc7G}q&;OlxtJzrKlx{gfutu_F!_#N z9I0K`kOOVN7wkf{_JbqrV?%-!xhmDAI-a>)25eo_&-z+kyMna@#!U@tBXnU1uRqTr zAQF482CdMOjo#I7+Zfgrl5A>}IqutD-o61m+Ae9B=VTy%mxSn}-Y}yD^7c8B*t!uX 
z+R7s;^3+LMiAGK=cka}`TV1_D!Dk;r?CMWqBfZ73G&S%hyaQbSi1sU6H!Rh%pWKs( z(4T0qQ1~O9E%xMHyd^z~XWA<3T66M;as6TYI*8W_mW-9L7mIjo{EK)wi%`D)*#yB- zgIS0lI4}&WX-OuTG&zx4Wbv|<%b^MF+2ji zCbN^i??^;++KKS*(4ZNk8xfJ>Yni*Z)!ZQ11ksn{(ghInYyF4Ra}1r18HfHce*ZR^ z?!8!Lw~?&Z-O4wPxUd^*bkAO_Ci>%105hH4jxO@(SK*275L$jBIsv+2C@&&o7q{TO zx4m~-;^?KZH?P0!k?(DL`<;|H3cdPZ)fxxFhsYS?cuXvh^}d>hPm_>O;WN;ekVcO z_mtYioZay?B5Mm4@*Y?6tTn#9VarlvCu8}=-FmLC03C|;xa*;;M1L8W)xCEPxb;Oq zotYvKmIxgPMgV0v4^0L9qzq_FB)}qY07{)9f)xOVfg(;6m~Gka|J7Ru+lTfxbN9A} z_DWc}c^B@ye@uvFkmCKu&~D4#>S419Piaj;n4kI#5#Az%2$x-Y=NKso!Bx)p7yvo(V7A7E2DDX?Rh6LkcJZ_~Zykkl1jKp{2JjJSEL1#S z@M*Hf4Jm$!3#QHpq*P;_o0mL#iDbuQf85tQ7oTgYWM7HE6suKPkMwae?K$rYB2yatb;2zAUx4avj zv^Ch<8gVTkyHrmKtPgq*k)&O2joWzua7_%z6;d@8--|&53>*U}7CL#MO>7O<+3$#8 z^zuSN2p4VH;Jo)Ht6vw=Mn}53u>&I;Of6E)t@@oDA;Bu$bY=&rfg%B~bQg|7!31_X zQ2ZtHBFSj3>KOhouOOuD6qUqNV$in1h?T)NOq{qkdlIm$Tl&faUyil5 zZWsEX;xWKHD?@L>%XgMtXAB+<0K_Y{!!aB#pWvvs5`TvduI93}o+nUl<`Nby3X2fmegM;BODp8$;VvEItL zk)osyLixov{21kxnK)iX_>g!95bSaDh1+>ngAx@u-dN8qmfg>ND_j$8xxfu`BXgoh zUD|Y>_ILr9?(tRbFuTnW!m0f+P_!b>*s76490ODc&(E$uk2n|) z;`n(rT)QNdC-%S|5=WQ*ob40q_A2W_s+YJIbJEa90B_dzL0^NO3V%0j03Ymy`%Y+X zZs*iy41gHL*B>#05u`ixz(9gNi1mxTH*GIL$`^oj&Ul_J_LMkkKm6muPF8~au>Og4 zGCI@tE5ISHsZ$b9sei%)zeR~z3LX6Ux1fW{|LG#n8SDt{dffsxzXyPm^2a-OVnwC# zUQfjHfd%mYSimG7Ve-izlOy)U#r{X$Q_$2y^^9oz8_!M6R zLMLN+E|1fenKxX~1FN#PM!e+TY@`Qh?F-SLv#`Qg7`K1yb^W=D;2iV6Of_-fNnW2>?FKfr%^Rc=!VSNd(SV2U!+!_DC2RpnCy?o__Ybkyn z|E)6|6s6_-=`6Wx<7$)0^ySPo|EoPqLK(iBM=sGXuY2zvXCHWIpgriLOBADK^YLfu zH(uCEZ(3i+&uPx~mhf*6ReQf9Icjli&obKeBDI|v?HBD@;q378!hH8hXAeqwFLQ)# zOy`v2C^+uD)k?T1lz6v`d*K%6*H+NmB0pDc)+G$(16(@ebCGwIDa9N)$sNVPZ}vTF zIucC11h)S^q8oNT&eL+uSVxY$eP}q6>(pVJ)yDCcW5+jdK8nsKc-eHsFE>B{5-AYDDR;b3K?-fQaf@d#7;@Z4Y(_Iul^-VjS zS~z1~d@B1&1xhHe$LkgX$!MlYH`hOH%mdIU`|~u@;I(#ZW^Fk)xbx}U0T@C&WCsj= zlAZX%W$KrEZ2;ZjQ-BA%!37W@4>*Psx@9VGv$6P(>-bIdZ?3mQxb76wr=F?qYt&Gb zzT@=%scR5#-_!2XgBA((R@KGmTrbBxOztw3#C*zM!{07y^AfFDY9o@6K9j&Xk}&co 
z!nrm$izizku{|~#3Cx?n-d4VDmZV)=v)j08WbWcFq5bx+Z*Qk30V3py`s+c&oZ(h`!HtEF^x1U3-i}l=nz3>2& z$%um^aX@McYZ27;cU316EQ#TQMMj0)dQUtTbnP2tNs`e01(x*Gw zaWb1@1RRffn%XMd1#|=4HeH?BfHxWSp$<|MJ^VTtVB@cRaS# z#JyirkCI>X5>rNGR8=AM-I#ZC`KckX&yr;65^uCzR_b>3o15$e6%&L-t2nv{Hd zM6*t3s<7fOi!nvm#&24Mg{h-75oKLr?M}(T!CakJ^3mb^C#z`&CE()&=^(q<0E^7jhmsYbI8WFsPuM`Qtu3zF zd+#2(-*A=#f!UvYl`<300!q$ULN3IyKp-OkenkFRR}16sTacmi>9)v(_ika=gf=_J zGkq0C{>{CB6IkJpxA-@@_L<~CP9h~KrNw(LUu5u>)IL@wXb zXhZyS-hru?Fp4&~2O(IOa!(a=#u)MJ7BlhUl`zvRv0jD)=oo~$9-b_kxj!uLW`&+F z1PdL|LH#3vQm@fSj)zn8j&auShbn2zBbaLw6)6)e`JZa9W!bS*6j=jaMIbuBK{jK6 zWaYNF71u2EIIQOW_|v>e)Ezj6{H=y4dpwyfM<1Z_gu-P=&2H}9XS1mGPJjQGb@Y_* zhkt*0z|HMMZ}1+?K>|gcG6pUJE3b`N>@FCvNN_r+sElU>5(Ao24Xu6>Qmy>&FP^VN zhws`Q!1+1d#WzhIdCa_4r^N;C-0>~xH;S168%(@An&oh}cV+sGDwaeUNOZejNhB6L zn@o@@ph8WpB`W}(f$X+IYLkXaf)UN943Y2va1rg@Y^NqhAa|D5fnIuL}CN?5s0HO z75f)mAw-QM1~e?(g}rx8*U8?!zIuEQQ!o)aa+Ta>vy6Qbw=cfI?nuw5kb-b zR)2{t{)-gumjuvTp#nBm)4g_Yaeb6A7RaPKh4XNqA~-@_VONH^q!s2b(zss>fPYUS z7t_1A7$;ENah~t~<*o)q(J8|Tpq%#QRGkv2AK;z_#wvDiC*;Bi|DTNbo(vHk+EZrp zZ@v*|%>W?KUR&bolYd$KZ!TPD6Z2g6A{Xy2dG-%3$1#heqr|IOLjb8d;P8)JAiMAR z#lXHTUR_iR(7{h;;yV|U2CCG5l+V6h0p&J=#ymamm_k6vi{M~@lLIPIh+YXDy!qQW z?7!@IZz{QK_*o1+C#d0-?jScCz=ps47C!1WN8X`3)AvuMdI}Q~7@-Ad3L|=MlXpAy zo8*1|#zS>vkhv8zXY8a|RX_)VNq*bqM?3)Pv(+I;+u`3sWB)~je?|0F;#osX$ta<6 z1B;j-+hIv_GN^wZ)L#qo8`+kuZR#ag`din3UGQ&95lHH9L-~El0BO?YL$-NP0^mpE zumq;68v$ZxhWJmg{69AAzVX~7@z&}Sb{@6{F-r8-ls`0c-+TZ&aw`UnyoElTfL#@3 zuPwA~B>2o}t9`r9Om4@IHilaEfxTty81#4P9|29rf0Aq3BlxXZP0Ni4v(WWO4Y9U# z3Q*whs_t*{Sta8znX_(vQ>ZnN&{9 z2mA-<{$$KQi`w4+`hP{W{M;=3huXCB)@7c9q7yK|XT^*u=KiX*r^4<2jplG!P7aRT z0=kR(2iwX(uTyl2hfMPq&HqIL|1X-xxPV*@Q0gy{dFb#5fTNb?f6he_Tui^ST`qwb zD5+ELuu{|ahdb}pU)KF@e$m#%fcqI0Ua`hqCN$m;udmMTD*$!Tipu1_f?b%JU@mMI z+xX!B&$l^cgzSe1@ab8x-$6@VAL8$oKxm5rNCzteBwjnI4YZSt@J&f(=&^Fxb3++JEriVP0DeXYKRkiU$iQJrx zby@oZuET$Bi;(&5LZ2IjwcjWL^osv@StQM$k9}u|`#)b8?bSQ%@e>CGnI?GbKU~`D z6WQN|B*pyi7RTR|o9k^tc*+)fc4uuxDseG%|AFlJrx|AK2a3cGq_!}X{MDPY-?3UH 
zbW}_q0YUnb4yv_act;Mvz=Jsr+9ugty9pe;SF!IOy!SydbmnTLca;Ap_zi*fpYpzo z;>z}jRi=L?`3p(@5t^QTXE;RO32h3b2u17PeUpJccA%GpfU_-#O&|DwBMrd2#O#o# zJb*X-qC{1uGmM|na|@EdYGXl4Pc*(Wy5r9+&o4F5cZ9sWV@CY*ocG>t2uRACx{WCRXN9q0-%aBxo_b|2sk0n3WYyQz<=DbMaKqmZm%>KKu_GPw2%5wI|ouSA! zFY8I>k>JNOUrqlUiU$cN?78|v2{pnHPSEDRAq%YF&X2_~O>C+Wx`J(G0=|GP{x9mv zBDcjSnK zB`WT3if<7ngX2I4_Vt_M`Byx?rPk5X_L~R&cdGva_@mney6xrJHXvlPyRq4m zS4sF_RgcUseF4yL?{}rDFxrFjgWq)ap0L;ITnR#++C7Y@Oe&XkA-Vs_f+&7`x~s?H ze-eaa9vXRI_}ZxbPE)X`L&TAKLdAG}*6eK^o zxQ|z9iY!NnyU5C6XeVu=&gaQP>W|4n{V+8fZ9L&^RX0DEz%?qZQ+Gw3S7!WXULO}r z3R4n3&mwZ#l5~{6Q6997lO}w6tdKL#)A=#a)FY%hF#eN2N7%(KZaxEs;~pm5S1o8( zfUc$o2B}coj@ocOT}FR+_VQ^ZOzmvLbrtE>Te(mMnCY~;Kn9`V`I!H^bL0C zSg3`L?+?I{bm$9~UHzCt4bnO31*t&r5FIaiZm2yj5xFpV4RePd(w$xZ*=^DXdOv)& z&)rU-y>uj{4Ql0~$r;enXXK`Jf3mK}fDo_E{rHvQG?o0aSF5_1@m=^*oH0urz7L98 z*~E)T!yXiltQ>~~7YpZF8bLEpbPt{-)dr2%npDi2u>lQ1nisMv*)_jFr(aP~cg|@0IG=8c zE3_)oR7;P)ICvi9jXjq`d62~fe$k{MByVB)qzEmvZ`sx<(K^XlifBo!Oh``-E5ESVTGZWW2*_mTe$s?TRtOiqDY$ieco z%S5oY9hC>?P(b1AD6C(3Ux!_pt4jNFfwoAT<>BZjmzWnZK_piXrw(NJ5=Z!EIjAr@ zL`aVyWKQG8y6UZ}#KM?0RiDGj*7TN-K5PgzEf?ttVkXnitdr}k{qU6|$i)rO?+oLj z4s&*3&d1DdJ2QMMH2p)bZv~iFUGKDTx}1G-v(nU23p61~(pm>an##>TQLrC72E(X0 z4T5Qnuxwy{+gR!%ItLf<0;e&mo*ah|Wh}o%S1Iu2B6qu@d#xPCymxfty5P5sk1Mn0W|oOLmx&cDx~o%C@;y0)jWep(9Az-+_etDb*N?%6=-*|2v;u=}R<2KIjRxWa}TPwFNVc#a04IBX5A zF61C%q{jOi)PnKWnv|+NM9U#XiFAb&Ord5S3v9zTH7yp$*ebPI(CeQtgZtCRs~t<( zJOauP>VuU{6@nA>2kQW>DjB^-B(k&;d$A;6qm|u9+rg4Opci5md|S{pR21vEVeiZM3w z)UpZDg5nZ^my2sWtPv6xUOpii?KlXhP> zgf$l|gRJFLaZj`;b|xc(BpB5eHzCQ<+P?Ax16mS!IQ?oR%pe@h#aBPE!Bj0fi0E=Z zc?Jefi|?FvjaZDG<)iy_HkIPqx$-lxmP%bTiPY&Tu?dsQA{`o@c7p`wC3H8fw7~WS z+~v3dgxIWO<_1Md_FV|~lOjWd!8`LD#ZgBxM0T8R<&$)wQIOn61~;`%LG(J`9)5+e zcLudL+API-W@e1aG!9X6rRI>5vh}Dayqc)X^ka9PT#U^{Acn#;(mSSQq^hc{9trq)RrVc*U;C&+OJ9>S3<#mg?$PIYqglosk$i@x{&_jyZ{tYjh z_{fH>F++SDe6B_)F!^)hLYYZfsLu6r3Y{Venqs)Z{hPgspvpDX78hrIsNIW+G1tiH zE_xn;dGksgbYHaY*IcT7?ISFVdC5}IN*h8odDfCfA9Q-QWc>Bb`|@@l2m^CI9P&F~ 
zKT$v91xDE+Pf0~+#q-(pxUN>lo$Om{Im~g)T6?Qd_G=pH6`bU9QA>KgM+DirN4CbAjVccj zh7}%2Tf^5WaM; z{;*TiKjO$$!6V|_MTUwaVXzrJnTlj67oHdJr3&ray-n%3@xlvR_^KIs@tV1K{Y+5L ztRuWeBo>zxuA88HKa0N?Bp&~iYV6qwR_$Wfc8|AUu@iplNi-=5^FtL>Ii95>JftlR zwClWZOE1<0{c|eH^cIeobB$EI=h!AQP0QcsVgv;PwdqSTg@uOnY&GdFe?!rj`W9P} zznV|MNvmWjL(!UEE9k>iG7FPy51&0ev*k6Q!7vk6naPbFvhTY<=hfeuSz4b1A9)0# z(A(KXEovuPm$p!i()zY@+cPw)u`srLiYya24MKL+bCzapezJs(vh9#jUsyJm94+;j zept|*IL~&{U4o`FRCw;%APpocN`TR0BVl8@a{BAEv;A9p2^uy^W0Ud%Tj(Mat5JEu zyX^s^KD(3OkbJI3WQxAh&CDe1U@MI!__hm>~)x|^6=bbR(TnDf!vD2G&i zN5OR?qd!2TPq^N*xA^sX56aC^G#4r7>y(Ua)TtEjG+~CX=ZMn_Kbj_P@Yf3GPpJ9Q zsG#d_-(hM89>_zz74?uZd);rt;`RWjEVeq9lvNqEb;pP!lNoeu4HL$@8ihKUxe=vAlnGf~;5T zN&QP6<|v`EsQ7fqxw)=vk=;Be4k?;v=L9RS@UJ~O1r zvLqtC^pT`ZBOxHexR(A@Y9{(FHF)+F^Kps>P9ua!a6gGo$%lx=twXYkJm!Lxk($IB z$`RY01Y^`1q}i`wmqW>tR+l)amVFv=?x*5d&amHEGLoRT?APFp8qGmx>OikL$79li zd4o3_HU`J)BY6cjN&|Uq5}XV!gsV2VERSOk8}g0|_Z3zrD3}moi6)iXkWf3 z6iL0NHv4S6m9jj2{aJG6{jEo*HEV;}qrU{SA)1F*EnhO~H#3jB@{U5*!mk0g$r@Qg zLMUN?c(aW<&kbPO!jaASaru5)Fi=a^py+|nwP%vZeB-xTBPmGEWcQi(w7Jy*9y@(GZKtJ~7S2@(OY`1lU7)W1K2kPOcc_wJf!@?M#*jV!>&F1zm6*)lH8tJq zFXgE1JBv(Oc7)kXCM=K(K0PsR+*L8(~YrNsS7L(N#GsYR7*X~ zV8KmYsw(_~Sn@`WYM@|>n|J?5Sc87~Wfr8=d6rPeF$c{U!51GF(hux5)I%(8JOX3X zK+|3KcA-_y6tWm;TA;-k>`4LG?N}hj^cpxK`rb;hHLk$NSof9ZpuDfqR_PV9pf;b) zug*JUgl&7tO8q2450cJt?t1<`u>8VEHlx1&rq!Vn?Q(+!Dv6JPeIHd(h>OnZg1yB3 zv<9yg9S0|xEF_&W*UeqOgB|S13Ys&w6T)G(5)gN&LX;DFzD2bpvf4Ng%mjVwlI_lG ztlm!5EYV=sw5VZUxa6O)U_=rpS{BAC26RTUScgs7Z~F>3GipuaXV?w+ysSBGe(7p2g<+ zbNg>+corQ~Eb!L$t_6->cCjLBo7Pm8q;-B$5?F(88&=?S1b7j>D4j}wI=LAE|n!ndn4X#=me;ES54-M zVT3Hiu(cZx1+}`OtG%>54_$Ie#OW$R?%gPt`AB!A!{Uvq@Jn#fq3;Vc1HSEz>XZki->^(Q#aHH(B$D7>h4W(pW zvx)HTkL`94m_aA^im5K99YP_NsV9?rp2%F)OA?ln%zeT&v!#)=W!o4{LOpx<8Jn~J zct?HFK-i*Uw~SFdGhPISUA~W%HEIA~Oa^nWSR(fHNj9-$y8TXXN}cewJAs%u&We{3 ztu&!LbcS?tD;nvt0dl@86oE6M=@ic!6-mQJ`&bo7RqlLzeVa!!kv)7+pEI?zAPg$8 zlqfJ~_c^EoEx}){67zwqp={-uDc2J2Lc68!oX%xgrd;!w!jh;^O3NPGyENr5CR&eR 
z2ZSLv`?EQ@7}*yn&1iL{H$Ge}*=_c^rS{o@eEsqc%K7m%Q~bjG_((XWt3I{ut58j= ztp!e1<*s}EVWq14)<_H5Q^I6V93KfLbsvJ?&W~A#D4<2A9h7Pce8CE+diBxQh4NIBs0g~NoM9Hhe%Q_uns0; zvvPzy?{6&(kJb}4z+b;rH7qi1R~5KMQP)SyubJp6V1DVvkgU$*5j+2h9V4;ppw3UV zF*j!qRph75K8t~EZoGE6an{5_&>gb#go$TiBXPLsnRi-+ zc&v?`jPP;R3-{mix_NVh=0ExSc^80hJQ-+*^c{Y{dTxymI#TFd3y?P(wEgmpNl3Y_ z;iv7M9+jCQ;#8Rt<&#oTtPY84^wtVu;8=d9=dp8R?c+6;B$n2tkR|Q->-17|xgx6{ z#wBv;sM5YP2c6Uj?s3B_H^1Ne-NP<{0vL=Z3_>kzEFhR8mmKlDb!iKmyoYkn^NC9bmivx zvzlQnYU``^%CuP#mQ(kuD&@t}^vo(i*otjuo3Ev#5!U)?Vf6>0z>EAf(@QVrT(cRB zS$rgv-!YTM8GhboRAKh9F0ji+yfNc)P;Pt1d(CLb>9alc>?4%HQk6(p|KRIR#`|8p zcb>jVMrnL(SZz*phu4aR7g|*(sxDQiOckjrUiL>9L?$_WRL*%2S_-ayUn|$a*u^QE zqQbA)pKC=qckB$+Vr>>De-hh*q8V*`H@H1$`Sh#I!iq#fAVYl^t2jj~TNYdeXfe*% zjJ?e(?Y43Zs>R2?HR6nVb4yF}p~g6Frcmqi50&XWg)k1QTRffM0*1(vRMCd95d(_~ zht=W!*)3T_*VMFxBK)&?BxoA%kprcoU72p!W);X5^<6oRw`2%^OLKi)sFk&-0ar1- z5rnFP+DvK8m)h~1U{4osXl&N8M#k}*((2}hD;qXfe!f31u{09t=bZ;;S)90`GBnH0 z*afc3w2CIJojJ^#3xahBv)Vpj>v;U8wWUMBR7|B4SMN_#trnoI7Z<$lZ{(a?2N0y1 zCZ;hGW_M+X-vUXZQf`)(yJk{cL)rq1;SC)Kx3Rl4%-dw!tYgS|9bU7w#xN1?{Pygm z@C|#hQO3^h?sqssZ4r8t{|$X7xR8_kopgGcYDLlS0!%o)o?d;TBK*=T$^EAX0VSwu z@)VcLc(Jt~;vW!)XY}6j(F1(vp>(-&GdxLbk@G}rT9P{YAXYrwzUn6`hCJ@vlKHHxrH&UD$>RZkRxW`D=E(+!72Xy3F<%_cpyV!iEw*oVC?Q&HG&v zt$S7N<%Qp2ho3ClUUSsT*|#@g);4~{e#;8T&_>L(HebXNMskeDs(2%6uHg(bt@&}! 
zO>>$MV)(H7F+yya%2vLf>H(k-(HTd!P}ckH4u1ACT8n z(0QNHEchwHkF|Kdlv}lF0MpXIG0t&=J^hM@Hy3r%^@Y{l=2_ZiTHet((TB}FxlZN| zLv#UV4iFWJ8>*??Rw5tm{Oqr{WtuqjTMS-O7JSVkz#vY4_qlD7laOgdw$+8`_O;PT zWY@7v$job@FVCC7SH?Z-&1~F9WZ#$vAzIa>W>ppKd_if`$@y0lB|DtlLstZ`VHv|E zmXxf>zMDaZdoFVYCV_^8trUdu$h7!(x>@1b!jKcxFLhv>rSOW(7tdK&ubu9sf#iN^X&zS3o|?$uQ?EDuxHWJ#v4dFgFhwKmn}iJd@` z?he>1N=x62;dRRvS_%yk*-Qu1?uIP^E6z%Gc&^o`cgqz26}q{BrR11F8LCLDzP^?^ z`f)vNg{#@u%aw*&nkJje=DJk9qIH(cm{C$wdE8>CxDWiUql}z%$E$G*n75~I9aGh|otW-5p zylyv1Dzv?~o|`m$c3QNawJ7t+Pou<|gl!^883pvGh(`|7UR?^Cc^wsu z`TC+PY_QG)TyD96$BbL_l;~QdlU2bA8`k`KvRn&EEsY<7t%T=Gr7bb?FP~TH_J?6> z3{qYfb~k0ukB>Qy>%ASKb{)A4w6xZH;dpt^!dkvld&u;xdF>O;aV$e=^JQj&>SkgP z{B@(k=sY1kj4{JC2$3LdU+mJPtx-VN7vN~tQ^dj^eXfgFleALz3khV!0pUEb zw9q84)`uY;Q7P#@>F~R6H)@2}mEXAaCOW@}w`We#3XWov4W5b3D~}2nE+7vYGYiVs zuFV9x3+)$#1kH^Qu0ISe7aIbGwB0Kr`Hu%J#z?JSIb z!gt&z%T30`YxEBSUMXnGn3MWSU@p>+JZ0|!<*#Z%G$;g8J?=7~x%*QD{Cl$_;?aVDrAu~evr?d@4jA;_unc6GjONHE?gkubPbmh|oQ%hR> z7{B^rj;q(%c8~;bT5DT!N%Dknl$+GHW%|3{g2lK5$&n#1$6d4aFDorBg7?pIQx{6k zpG%i0f4R5?3D`7&_8gPZraXL^?!XxT*~a64*O>mfu`oQu0UIV#7r?{S!rQuqXJ~ zOT2m6e*@A$LtW^9Gc8E-N>t6tNKzNW`N^hrcGuyHnn5Oe&x6_H3M;A0TYz2$F8!7Y zQr03&t;97wkN_%82WP}&M)-ZgeJYFhRuSgKaFMNHiwIBOfwpONtr{P-S4Qz{@>4zzCf)^U z*<@`Q-m6xrR!myvkHdRRJsuxx>8_o5ymr_Z2R>!vjLRorE2)A;_)l= z!1jsQcQVw}3|I9!zVvk^tdYmCKp}4Q@KA%$h89#a*J|CUlCHCzl?qPURevmbxL|U5 zT4HWt+#{jW^s>drRIn#MWl|@@`P4vo!Pcqy6F@~iv&E?%3RF2wb!+qE4V_f;O5_kwV}_=3b`q@n(@wENj46jM^XjoRj%l1CH}y6w(6D)4*rC$3{=sE(V&3 z_j7~=OpkvMU~%P>*-oc9udhLIb>WUey4Odi*Incp0G~W5oe#a{Ry<+eSi?HDYP#_u z9eM7(ogS!%T8V|}Li;8=gl;^`MF=~mTKPn@iSKRB7bIyBb;oPj>&c43zU-JH6ImM` z{{+e-k?`_7tz^W>O+$7tyQ@dwC){6YImdCMS`JYuI6m`>aD;O0+o~wz- zpybk6vm(}xuZ3Bi3$@Ce4>T9t(kok|8#Ab3PeKW#p7CWBNVDl52TqQzJWcS+5a}Fd zFJXhcHhj%6zjc#;eqQ&_ZL_r+zd4z- m!AP(@hNdL%-Oas2H$O~2cP3(Q1^$2^k&GnxZjOY( 0" then "processors * * * grid numa" + +units lj +atom_style atomic +comm_modify mode single vel yes + +lattice fcc 3.0 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box 
+create_atoms 1 box +mass 1 1.0 + +velocity all create 1.0 87287 loop geom + +pair_style dpd 1.0 1.0 928948 +pair_coeff 1 1 25.0 4.5 + +neighbor 0.5 bin +neigh_modify delay 0 every 1 + +fix 1 all nve +timestep 0.04 + +thermo 1000 + +if "$p > 0" then "run_style verlet/power" + +if "$w > 0" then "run $w" +run ${rr} diff --git a/src/USER-INTEL/dihedral_fourier_intel.cpp b/src/USER-INTEL/dihedral_fourier_intel.cpp new file mode 100644 index 0000000000..805ffc0e25 --- /dev/null +++ b/src/USER-INTEL/dihedral_fourier_intel.cpp @@ -0,0 +1,441 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include "dihedral_fourier_intel.h" +#include "atom.h" +#include "comm.h" +#include "memory.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "pair.h" +#include "update.h" +#include "error.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp) + : DihedralFourier(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +void DihedralFourierIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + DihedralFourier::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = 0; + + if (evflag) { + if (vflag && !eflag) { + if (force->newton_bond) + eval<0,1,1>(vflag, buffers, fc); + else + eval<0,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +template +void 
DihedralFourierIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->ndihedrallist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const dihedrallist = + (int5_t *) neighbor->dihedrallist[0]; + + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif + const int i1 = dihedrallist[n].a; + const int i2 = dihedrallist[n].b; + const int i3 = dihedrallist[n].c; + const int i4 = dihedrallist[n].d; + const int type = dihedrallist[n].t; + + // 1st bond + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - 
x[i2].z; + + // 2nd bond + + const flt_t vb2xm = x[i2].x - x[i3].x; + const flt_t vb2ym = x[i2].y - x[i3].y; + const flt_t vb2zm = x[i2].z - x[i3].z; + + // 3rd bond + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + + // c,s calculation + + const flt_t ax = vb1y*vb2zm - vb1z*vb2ym; + const flt_t ay = vb1z*vb2xm - vb1x*vb2zm; + const flt_t az = vb1x*vb2ym - vb1y*vb2xm; + const flt_t bx = vb3y*vb2zm - vb3z*vb2ym; + const flt_t by = vb3z*vb2xm - vb3x*vb2zm; + const flt_t bz = vb3x*vb2ym - vb3y*vb2xm; + + const flt_t rasq = ax*ax + ay*ay + az*az; + const flt_t rbsq = bx*bx + by*by + bz*bz; + const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm; + const flt_t rg = sqrt(rgsq); + + flt_t rginv, ra2inv, rb2inv; + rginv = ra2inv = rb2inv = (flt_t)0.0; + if (rg > 0) rginv = (flt_t)1.0/rg; + if (rasq > 0) ra2inv = (flt_t)1.0/rasq; + if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq; + const flt_t rabinv = sqrt(ra2inv*rb2inv); + + flt_t c = (ax*bx + ay*by + az*bz)*rabinv; + const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z); + + // error check + #ifndef LMP_INTEL_USE_SIMDOFF + if (c > PTOLERANCE || c < MTOLERANCE) { + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + #endif + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + flt_t deng; + flt_t df = (flt_t)0.0; + if (EFLAG) deng = (flt_t)0.0; + + for (int j = 0; j 
< nterms[type]; j++) { + const flt_t tcos_shift = fc.bp[j][type].cos_shift; + const flt_t tsin_shift = fc.bp[j][type].sin_shift; + const flt_t tk = fc.bp[j][type].k; + const int m = fc.bp[j][type].multiplicity; + + flt_t p = (flt_t)1.0; + flt_t ddf1, df1; + ddf1 = df1 = (flt_t)0.0; + + for (int i = 0; i < m; i++) { + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; + } + + p = p*tcos_shift + df1*tsin_shift; + df1 = df1*tcos_shift - ddf1*tsin_shift; + df1 *= -m; + p += (flt_t)1.0; + + if (m == 0) { + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; + } + + if (EFLAG) deng += tk * p; + df -= tk * df1; + } + + const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; + const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm; + const flt_t fga = fg*ra2inv*rginv; + const flt_t hgb = hg*rb2inv*rginv; + const flt_t gaa = -ra2inv*rg; + const flt_t gbb = rb2inv*rg; + + const flt_t dtfx = gaa*ax; + const flt_t dtfy = gaa*ay; + const flt_t dtfz = gaa*az; + const flt_t dtgx = fga*ax - hgb*bx; + const flt_t dtgy = fga*ay - hgb*by; + const flt_t dtgz = fga*az - hgb*bz; + const flt_t dthx = gbb*bx; + const flt_t dthy = gbb*by; + const flt_t dthz = gbb*bz; + + const flt_t sx2 = df*dtgx; + const flt_t sy2 = df*dtgy; + const flt_t sz2 = df*dtgz; + + flt_t f1x = df*dtfx; + flt_t f1y = df*dtfy; + flt_t f1z = df*dtfz; + + const flt_t f2x = sx2 - f1x; + const flt_t f2y = sy2 - f1y; + const flt_t f2z = sz2 - f1z; + + flt_t f4x = df*dthx; + flt_t f4y = df*dthy; + flt_t f4z = df*dthz; + + const flt_t f3x = -sx2 - f4x; + const flt_t f3y = -sy2 - f4y; + const flt_t f3z = -sz2 - f4z; + + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, 
f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif + } + + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + } + } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oedihedral += sedihedral; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif + } // omp parallel + + if (EFLAG) energy += oedihedral; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void DihedralFourierIntel::init_style() +{ + DihedralFourier::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void 
DihedralFourierIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->ndihedraltypes + 1; + fc.set_ntypes(bp1, setflag, nterms, memory); + + for (int i = 1; i < bp1; i++) { + if (setflag[i]) { + for (int j = 0; j < nterms[i]; j++) { + fc.bp[j][i].cos_shift = cos_shift[i][j]; + fc.bp[j][i].sin_shift = sin_shift[i][j]; + fc.bp[j][i].k = k[i][j]; + fc.bp[j][i].multiplicity = multiplicity[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::ForceConst::set_ntypes(const int nbondtypes, + int *setflag, + int *nterms, + Memory *memory) { + if (nbondtypes != _nbondtypes) { + if (_nbondtypes > 0) + _memory->destroy(bp); + + if (nbondtypes > 0) { + _maxnterms = 1; + for (int i = 1; i <= nbondtypes; i++) + if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]); + + _memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp"); + } + } + _nbondtypes = nbondtypes; + _memory = memory; +} diff --git a/src/USER-INTEL/dihedral_fourier_intel.h b/src/USER-INTEL/dihedral_fourier_intel.h new file mode 100644 index 0000000000..a775e129f4 --- /dev/null +++ b/src/USER-INTEL/dihedral_fourier_intel.h @@ -0,0 +1,82 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef DIHEDRAL_CLASS + +DihedralStyle(fourier/intel,DihedralFourierIntel) + +#else + +#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H +#define LMP_DIHEDRAL_FOURIER_INTEL_H + +#include "dihedral_fourier.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class DihedralFourierIntel : public DihedralFourier { + + public: + DihedralFourierIntel(class LAMMPS *lmp); + virtual void compute(int, int); + void init_style(); + + private: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t cos_shift, sin_shift, k; + int multiplicity; } fc_packed1; + + fc_packed1 **bp; + + ForceConst() : _nbondtypes(0) {} + ~ForceConst() { set_ntypes(0, NULL, NULL, NULL); } + + void set_ntypes(const int nbondtypes, int *setflag, int *nterms, + Memory *memory); + + private: + int _nbondtypes, _maxnterms; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 637fc0d06e..eac48b8510 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -285,6 +285,7 @@ int FixIntel::setmask() { int mask = 0; mask |= PRE_REVERSE; + mask |= MIN_PRE_REVERSE; #ifdef _LMP_INTEL_OFFLOAD mask |= POST_FORCE; mask |= MIN_POST_FORCE; diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index 068e5ed890..d7093e79bb 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -43,6 +43,7 @@ class FixIntel : public Fix { virtual int setmask(); virtual void 
init(); virtual void setup(int); + inline void min_setup(int in) { setup(in); } void setup_pre_reverse(int eflag = 0, int vflag = 0); void pair_init_check(const bool cdmessage=false); @@ -50,6 +51,8 @@ class FixIntel : public Fix { void kspace_init_check(); void pre_reverse(int eflag = 0, int vflag = 0); + inline void min_pre_reverse(int eflag = 0, int vflag = 0) + { pre_reverse(eflag, vflag); } // Get all forces, calculation results from coprocesser void sync_coprocessor(); diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index b4b664cb94..ac208f5a0c 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -409,6 +409,7 @@ void IntelBuffers::grow_ccache(const int off_flag, IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0); lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef"); #endif + memset(_ccachei, 0, vsize * sizeof(int)); memset(_ccachej, 0, vsize * sizeof(int)); #ifdef _LMP_INTEL_OFFLOAD @@ -425,7 +426,7 @@ void IntelBuffers::grow_ccache(const int off_flag, #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \ - nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \ + in(ccachei:length(vsize) alloc_if(1) free_if(0)) \ in(ccachej:length(vsize) alloc_if(1) free_if(0)) } #ifdef LMP_USE_AVXCD diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index a7663d54a6..d49d0d8b00 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, ito = inum; \ } +#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \ + nthr, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ip = 1; \ + ito = inum; \ + } + #endif #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ diff --git 
a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp index 12101712f1..e6d45d7b2c 100644 --- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp @@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) @@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + stencil[k]]; const int bend = binhead[ibin + stencil[k] + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index 79dc75366e..0068e02635 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) @@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin]; const int bend = binhead[ibin + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) { diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp new file mode 100644 index 0000000000..0b5760a7b0 --- /dev/null +++ b/src/USER-INTEL/pair_dpd_intel.cpp @@ -0,0 +1,617 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve 
Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) + Shun Xu (Computer Network Information Center, CAS) +------------------------------------------------------------------------- */ + +#include +#include "pair_dpd_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "suffix.h" +using namespace LAMMPS_NS; + +#define LMP_MKL_RNG VSL_BRNG_MT19937 +#define FC_PACKED1_T typename ForceConst::fc_packed1 +#define IEPSILON 1.0e10 + +/* ---------------------------------------------------------------------- */ + +PairDPDIntel::PairDPDIntel(LAMMPS *lmp) : + PairDPD(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + random_thread = NULL; + _nrandom_thread = 0; +} + +/* ---------------------------------------------------------------------- */ + +PairDPDIntel::~PairDPDIntel() +{ + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_NO_MKL_RNG + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #else + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #endif + } + #endif + delete []random_thread; +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDIntel::compute(int eflag, int vflag) +{ + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); + 
+ fix->balance_stamp(); + vflag_fdotr = 0; +} + +template +void PairDPDIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag, vflag); + } else evflag = vflag_fdotr = 0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; + #if defined(_OPENMP) + #pragma omp parallel if(packthreads > 1) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + fix->stop_watch(TIME_PACK); + } + + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (_onetype) { + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } else { + if (eflag) { + if (force->newton_pair) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, 
buffers, fc, host_start, inum); + } else { + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } +} + +template +void PairDPDIntel::eval(const int offload, const int vflag, + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) +{ + const int inum = aend - astart; + if (inum == 0) return; + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * _noalias const x = buffers->get_x(offload); + typedef struct { double x, y, z; } lmp_vt; + lmp_vt *v = (lmp_vt *)atom->v[0]; + const flt_t dtinvsqrt = 1.0/sqrt(update->dt); + + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + const FC_PACKED1_T * _noalias const param = fc.param[0]; + const flt_t * _noalias const special_lj = fc.special_lj; + int * _noalias const rngi_thread = fc.rngi; + const int rng_size = buffers->get_max_nbors(); + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + int *overflow = fix->get_off_overflow_flag(); + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, 0); + + acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = 
ov2 = ov3 = ov4 = ov5 = (acc_t)0; + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + #ifdef LMP_NO_MKL_RNG + RanMars *my_random = random_thread[tid]; + #else + VSLStreamStatePtr *my_random = &(random_thread[tid]); + #endif + flt_t *my_rand_buffer = fc.rand_buffer_thread[tid]; + int rngi = rngi_thread[tid]; + + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + flt_t icut, a0, gamma, sigma; + if (ONETYPE) { + icut = param[3].icut; + a0 = param[3].a0; + gamma = param[3].gamma; + sigma = param[3].sigma; + } + for (int i = iifrom; i < iito; i += iip) { + int itype, ptr_off; + const FC_PACKED1_T * _noalias parami; + if (!ONETYPE) { + itype = x[i].w; + ptr_off = itype * ntypes; + parami = param + ptr_off; + } + + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp, fytmp, fztmp, fwtmp; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const flt_t vxtmp = v[i].x; + const flt_t vytmp = v[i].y; + const flt_t vztmp = v[i].z; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + + if (rngi + jnum > rng_size) { + #ifdef LMP_NO_MKL_RNG + for (int jj = 0; jj < rngi; jj++) + my_rand_buffer[jj] = my_random->gaussian(); + #else + if (sizeof(flt_t) == sizeof(float)) + vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, + (float*)my_rand_buffer, (float)0.0, (float)1.0 ); + else + vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, + 
(double*)my_rand_buffer, 0.0, 1.0 ); + #endif + rngi = 0; + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < jnum; jj++) { + flt_t forcelj, evdwl; + forcelj = evdwl = (flt_t)0.0; + + int j, jtype, sbindex; + if (!ONETYPE) { + sbindex = jlist[jj] >> SBBITS & 3; + j = jlist[jj] & NEIGHMASK; + } else + j = jlist[jj]; + + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + if (!ONETYPE) { + jtype = x[j].w; + icut = parami[jtype].icut; + } + const flt_t rsq = delx * delx + dely * dely + delz * delz; + const flt_t rinv = (flt_t)1.0/sqrt(rsq); + + if (rinv > icut) { + flt_t factor_dpd; + if (!ONETYPE) factor_dpd = special_lj[sbindex]; + + flt_t delvx = vxtmp - v[j].x; + flt_t delvy = vytmp - v[j].y; + flt_t delvz = vztmp - v[j].z; + flt_t dot = delx*delvx + dely*delvy + delz*delvz; + flt_t randnum = my_rand_buffer[jj]; + + flt_t iwd = rinv - icut; + if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0; + + if (!ONETYPE) { + a0 = parami[jtype].a0; + gamma = parami[jtype].gamma; + sigma = parami[jtype].sigma; + } + flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt; + if (!ONETYPE) fpair *= factor_dpd; + fpair *= iwd; + + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + flt_t cut = (flt_t)1.0/icut; + flt_t r = (flt_t)1.0/rinv; + evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut); + if (!ONETYPE) evdwl *= factor_dpd; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; + } + } + + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); + } // if rsq + } 
// for jj + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); + rngi += jnum; + } // for ii + + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); + rngi_thread[tid] = rngi; + } // end omp + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; + } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EFLAG || vflag) + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- + global settings + ------------------------------------------------------------------------- */ + +void PairDPDIntel::settings(int narg, char **arg) { + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_NO_MKL_RNG + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #else + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #endif + } + delete []random_thread; + #endif + PairDPD::settings(narg,arg); + _nrandom_thread = comm->nthreads; + + #ifdef LMP_NO_MKL_RNG + + random_thread =new RanMars*[comm->nthreads]; + 
random_thread[0] = random; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if (tid > 0) + random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid); + } + #endif + + #else + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #endif +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDIntel::init_style() +{ + PairDPD::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + fix->pair_init_check(); + #ifdef _LMP_INTEL_OFFLOAD + if (fix->offload_balance() != 0.0) + error->all(FLERR, + "Offload for dpd/intel is not yet available. 
Set balance to 0."); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairDPDIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + _onetype = 0; + if (atom->ntypes == 1 && !atom->molecular) _onetype = 1; + + int tp1 = atom->ntypes + 1; + fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop); + buffers->set_ntypes(tp1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; + } else { + cut = init_one(i,j); + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; + } + } + } + + for (int i = 0; i < 4; i++) { + fc.special_lj[i] = force->special_lj[i]; + fc.special_lj[0] = 1.0; + } + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.param[i][j].a0 = a0[i][j]; + fc.param[i][j].gamma = gamma[i][j]; + fc.param[i][j].sigma = sigma[i][j]; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairDPDIntel::ForceConst::set_ntypes(const int ntypes, + const int nthreads, + const int max_nbors, + Memory *memory, + const int cop) { + if (ntypes != _ntypes) { + if (_ntypes > 0) { + 
_memory->destroy(param); + _memory->destroy(rand_buffer_thread); + _memory->destroy(rngi); + } + if (ntypes > 0) { + _cop = cop; + memory->create(param,ntypes,ntypes,"fc.param"); + memory->create(rand_buffer_thread, nthreads, max_nbors, + "fc.rand_buffer_thread"); + memory->create(rngi,nthreads,"fc.param"); + for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors; + } + } + _ntypes = ntypes; + _memory = memory; +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts + ------------------------------------------------------------------------- */ + +void PairDPDIntel::read_restart_settings(FILE *fp) +{ + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_NO_MKL_RNG + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #else + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #endif + } + delete []random_thread; + #endif + PairDPD::read_restart_settings(fp); + _nrandom_thread = comm->nthreads; + + #ifdef LMP_NO_MKL_RNG + + random_thread =new RanMars*[comm->nthreads]; + random_thread[0] = random; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if (tid > 0) + random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid); + } + #endif + + #else + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #endif +} diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h new file mode 100644 index 0000000000..9181ff38f4 --- /dev/null +++ b/src/USER-INTEL/pair_dpd_intel.h @@ -0,0 +1,110 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve 
Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) + Shun Xu (Computer Network Information Center, CAS) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(dpd/intel,PairDPDIntel) + +#else + +#ifndef LMP_PAIR_DPD_INTEL_H +#define LMP_PAIR_DPD_INTEL_H + +#include "pair_dpd.h" +#include "fix_intel.h" + +#ifdef LMP_NO_MKL_RNG +#include "random_mars.h" +#else +#include "mkl_vsl.h" +#endif + +namespace LAMMPS_NS { + +class PairDPDIntel : public PairDPD { + + public: + PairDPDIntel(class LAMMPS *); + ~PairDPDIntel(); + + virtual void compute(int, int); + void settings(int, char **); + void init_style(); + void read_restart_settings(FILE *); + + private: + FixIntel *fix; + int _cop, _onetype, _nrandom_thread; + + #ifdef LMP_NO_MKL_RNG + RanMars **random_thread; + #else + VSLStreamStatePtr *random_thread; + #endif + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); + + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + // ---------------------------------------------------------------------- + + template + class ForceConst { + public: + typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1; + + _alignvar(flt_t special_lj[4],64); + fc_packed1 **param; + flt_t **rand_buffer_thread; + int 
*rngi; + + ForceConst() : _ntypes(0) {} + ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } + + void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, + Memory *memory, const int cop); + + private: + int _ntypes, _cop; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self-explanatory. + +*/ From 529eeb603923964e3853fbb272187d47042a93f6 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 2 Oct 2017 09:31:39 -0600 Subject: [PATCH 30/53] Reduce GPU data transfer --- src/KOKKOS/comm_kokkos.cpp | 8 +++++--- src/KOKKOS/neighbor_kokkos.cpp | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index ba44ea813f..a8b591e210 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -523,7 +523,7 @@ void CommKokkos::exchange_device() k_exchange_copylist.h_view(i) = sendpos; sendpos--; } else - k_exchange_copylist.h_view(i) = -1; + k_exchange_copylist.h_view(i) = -1; } k_exchange_copylist.modify(); @@ -916,8 +916,10 @@ void CommKokkos::borders_device() { if (exec_space == Host) k_sendlist.sync(); atomKK->modified(exec_space,ALL_MASK); - atomKK->sync(Host,TAG_MASK); - if (map_style) atom->map_set(); + if (map_style) { + atomKK->sync(Host,TAG_MASK); + atom->map_set(); + } } /* ---------------------------------------------------------------------- realloc the size of the send buffer as needed with BUFFACTOR and bufextra diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp index 9a40808052..f34b149864 100644 --- a/src/KOKKOS/neighbor_kokkos.cpp +++ b/src/KOKKOS/neighbor_kokkos.cpp @@ -310,9 +310,9 @@ void NeighborKokkos::build_kokkos(int topoflag) // build pairwise lists for all perpetual NPair/NeighList // grow() with nlocal/nall args so that only realloc if have to - 
atomKK->sync(Host,ALL_MASK); for (i = 0; i < npair_perpetual; i++) { m = plist[i]; + if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK); if (!lists[m]->copy) lists[m]->grow(nlocal,nall); neigh_pair[m]->build_setup(); neigh_pair[m]->build(lists[m]); From 8d384b9149c71d576fcea8f1b3f7cef54d3ec2ec Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 2 Oct 2017 15:03:48 -0400 Subject: [PATCH 31/53] whitespace cleanup --- src/dump.cpp | 6 +++--- src/modify.cpp | 56 +++++++++++++++++++++++++------------------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/dump.cpp b/src/dump.cpp index 44098298ba..ddd958c25c 100644 --- a/src/dump.cpp +++ b/src/dump.cpp @@ -238,7 +238,7 @@ void Dump::init() int gcmcflag = 0; for (int i = 0; i < modify->nfix; i++) if ((strcmp(modify->fix[i]->style,"gcmc") == 0)) - gcmcflag = 1; + gcmcflag = 1; if (sortcol == 0 && atom->tag_consecutive() && !gcmcflag) { tagint *tag = atom->tag; @@ -898,7 +898,7 @@ void Dump::modify_params(int narg, char **arg) } else if (strcmp(arg[iarg],"fileper") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command"); if (!multiproc) - error->all(FLERR,"Cannot use dump_modify fileper " + error->all(FLERR,"Cannot use dump_modify fileper " "without % in dump file name"); int nper = force->inumeric(FLERR,arg[iarg+1]); if (nper <= 0) error->all(FLERR,"Illegal dump_modify command"); @@ -973,7 +973,7 @@ void Dump::modify_params(int narg, char **arg) } else if (strcmp(arg[iarg],"nfile") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command"); if (!multiproc) - error->all(FLERR,"Cannot use dump_modify nfile " + error->all(FLERR,"Cannot use dump_modify nfile " "without % in dump file name"); int nfile = force->inumeric(FLERR,arg[iarg+1]); if (nfile <= 0) error->all(FLERR,"Illegal dump_modify command"); diff --git a/src/modify.cpp b/src/modify.cpp index 4516788aa9..f723eb38fa 100644 --- a/src/modify.cpp +++ b/src/modify.cpp @@ -863,9 +863,9 @@ void 
Modify::add_fix(int narg, char **arg, int trysuffix) fix[ifix]->restart(state_restart_global[i]); used_restart_global[i] = 1; if (comm->me == 0) { - if (screen) + if (screen) fprintf(screen,"Resetting global fix info from restart file:\n"); - if (logfile) + if (logfile) fprintf(logfile,"Resetting global fix info from restart file:\n"); if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", fix[ifix]->style,fix[ifix]->id); @@ -885,9 +885,9 @@ void Modify::add_fix(int narg, char **arg, int trysuffix) fix[ifix]->unpack_restart(j,index_restart_peratom[i]); fix[ifix]->restart_reset = 1; if (comm->me == 0) { - if (screen) + if (screen) fprintf(screen,"Resetting peratom fix info from restart file:\n"); - if (logfile) + if (logfile) fprintf(logfile,"Resetting peratom fix info from restart file:\n"); if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", fix[ifix]->style,fix[ifix]->id); @@ -1409,24 +1409,24 @@ void Modify::restart_deallocate(int flag) if (flag && comm->me == 0) { int i; for (i = 0; i < nfix_restart_global; i++) - if (used_restart_global[i] == 0) break; + if (used_restart_global[i] == 0) break; if (i == nfix_restart_global) { - if (screen) + if (screen) fprintf(screen,"All restart file global fix info " "was re-assigned\n"); - if (logfile) + if (logfile) fprintf(logfile,"All restart file global fix info " "was re-assigned\n"); } else { - if (screen) fprintf(screen,"Unused restart file global fix info:\n"); - if (logfile) fprintf(logfile,"Unused restart file global fix info:\n"); - for (i = 0; i < nfix_restart_global; i++) { - if (used_restart_global[i]) continue; - if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", - style_restart_global[i],id_restart_global[i]); - if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", - style_restart_global[i],id_restart_global[i]); - } + if (screen) fprintf(screen,"Unused restart file global fix info:\n"); + if (logfile) fprintf(logfile,"Unused restart file global fix info:\n"); + for (i = 0; i < 
nfix_restart_global; i++) { + if (used_restart_global[i]) continue; + if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", + style_restart_global[i],id_restart_global[i]); + if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", + style_restart_global[i],id_restart_global[i]); + } } } @@ -1445,24 +1445,24 @@ void Modify::restart_deallocate(int flag) if (flag && comm->me == 0) { int i; for (i = 0; i < nfix_restart_peratom; i++) - if (used_restart_peratom[i] == 0) break; + if (used_restart_peratom[i] == 0) break; if (i == nfix_restart_peratom) { - if (screen) + if (screen) fprintf(screen,"All restart file peratom fix info " "was re-assigned\n"); - if (logfile) + if (logfile) fprintf(logfile,"All restart file peratom fix info " "was re-assigned\n"); } else { - if (screen) fprintf(screen,"Unused restart file peratom fix info:\n"); - if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n"); - for (i = 0; i < nfix_restart_peratom; i++) { - if (used_restart_peratom[i]) continue; - if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", - style_restart_peratom[i],id_restart_peratom[i]); - if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", - style_restart_peratom[i],id_restart_peratom[i]); - } + if (screen) fprintf(screen,"Unused restart file peratom fix info:\n"); + if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n"); + for (i = 0; i < nfix_restart_peratom; i++) { + if (used_restart_peratom[i]) continue; + if (screen) fprintf(screen," fix style: %s, fix ID: %s\n", + style_restart_peratom[i],id_restart_peratom[i]); + if (logfile) fprintf(logfile," fix style: %s, fix ID: %s\n", + style_restart_peratom[i],id_restart_peratom[i]); + } } } From 2a24cbfe0c2f4158aeac7fa833f59f918dcfe811 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 2 Oct 2017 21:13:51 -0400 Subject: [PATCH 32/53] reverse logic for using MKL pRNG: this way, make serial and make mpi will compile LAMMPS with USER-INTEL installed --- 
src/MAKE/OPTIONS/Makefile.intel_coprocessor | 2 +- src/MAKE/OPTIONS/Makefile.intel_cpu | 3 +- src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi | 3 +- .../OPTIONS/Makefile.intel_knl_coprocessor | 6 +- src/USER-INTEL/pair_dpd_intel.cpp | 90 +++++++++---------- src/USER-INTEL/pair_dpd_intel.h | 16 ++-- 6 files changed, 62 insertions(+), 58 deletions(-) mode change 100755 => 100644 src/MAKE/OPTIONS/Makefile.intel_cpu diff --git a/src/MAKE/OPTIONS/Makefile.intel_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_coprocessor index a717be93ff..75e4d89170 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_coprocessor +++ b/src/MAKE/OPTIONS/Makefile.intel_coprocessor @@ -10,7 +10,7 @@ CC = mpiicpc MIC_OPT = -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\"" CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ -xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \ - -qoverride-limits $(MIC_OPT) + -qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu old mode 100755 new mode 100644 index b7db064574..2c3cc51249 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu @@ -9,7 +9,8 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) + -fno-alias -ansi-alias -restrict $(OPTFLAGS) \ + -DLMP_USE_MKL_RNG SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index 8a45b781f8..ff2d0cc5c2 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -9,7 +9,8 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 
-qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT + -fno-alias -ansi-alias -restrict $(OPTFLAGS) \ + -DLMP_USE_MKL_RNG -DLMP_INTEL_USELRT SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor index 406e98b36d..769c166105 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor +++ b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor @@ -9,8 +9,10 @@ SHELL = /bin/sh CC = mpiicpc MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2 CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ - -xHost -fno-alias -ansi-alias -restrict \ - -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT + -xHost -fno-alias -ansi-alias -restrict \ + -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT \ + -DLMP_USE_MKL_RNG + SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp index 0b5760a7b0..c7cddfccc1 100644 --- a/src/USER-INTEL/pair_dpd_intel.cpp +++ b/src/USER-INTEL/pair_dpd_intel.cpp @@ -47,12 +47,12 @@ PairDPDIntel::~PairDPDIntel() { #if defined(_OPENMP) if (_nrandom_thread) { - #ifdef LMP_NO_MKL_RNG - for (int i = 1; i < _nrandom_thread; i++) - delete random_thread[i]; - #else + #ifdef LMP_USE_MKL_RNG for (int i = 0; i < _nrandom_thread; i++) vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; #endif } #endif @@ -216,10 +216,10 @@ void PairDPDIntel::eval(const int offload, const int vflag, iifrom += astart; iito += astart; - #ifdef LMP_NO_MKL_RNG - RanMars *my_random = random_thread[tid]; - #else + #ifdef LMP_USE_MKL_RNG VSLStreamStatePtr *my_random = &(random_thread[tid]); + #else + RanMars *my_random = random_thread[tid]; #endif flt_t *my_rand_buffer = fc.rand_buffer_thread[tid]; int rngi = rngi_thread[tid]; @@ -264,16 +264,16 @@ void PairDPDIntel::eval(const int offload, const int vflag, if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 
(acc_t)0; if (rngi + jnum > rng_size) { - #ifdef LMP_NO_MKL_RNG - for (int jj = 0; jj < rngi; jj++) - my_rand_buffer[jj] = my_random->gaussian(); - #else + #ifdef LMP_USE_MKL_RNG if (sizeof(flt_t) == sizeof(float)) vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, (float*)my_rand_buffer, (float)0.0, (float)1.0 ); else vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, (double*)my_rand_buffer, 0.0, 1.0 ); + #else + for (int jj = 0; jj < rngi; jj++) + my_rand_buffer[jj] = my_random->gaussian(); #endif rngi = 0; } @@ -420,12 +420,12 @@ void PairDPDIntel::eval(const int offload, const int vflag, void PairDPDIntel::settings(int narg, char **arg) { #if defined(_OPENMP) if (_nrandom_thread) { - #ifdef LMP_NO_MKL_RNG - for (int i = 1; i < _nrandom_thread; i++) - delete random_thread[i]; - #else + #ifdef LMP_USE_MKL_RNG for (int i = 0; i < _nrandom_thread; i++) vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; #endif } delete []random_thread; @@ -433,7 +433,19 @@ void PairDPDIntel::settings(int narg, char **arg) { PairDPD::settings(narg,arg); _nrandom_thread = comm->nthreads; - #ifdef LMP_NO_MKL_RNG + #ifdef LMP_USE_MKL_RNG + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #else random_thread =new RanMars*[comm->nthreads]; random_thread[0] = random; @@ -446,18 +458,6 @@ void PairDPDIntel::settings(int narg, char **arg) { } #endif - #else - - random_thread=new VSLStreamStatePtr[comm->nthreads]; - #if defined(_OPENMP) - #pragma omp parallel - { - int tid = omp_get_thread_num(); - vslNewStream(&random_thread[tid], LMP_MKL_RNG, - seed + comm->me + comm->nprocs * tid ); - } - #endif - #endif } @@ -575,12 +575,12 @@ void PairDPDIntel::read_restart_settings(FILE *fp) { #if defined(_OPENMP) if 
(_nrandom_thread) { - #ifdef LMP_NO_MKL_RNG - for (int i = 1; i < _nrandom_thread; i++) - delete random_thread[i]; - #else + #ifdef LMP_USE_MKL_RNG for (int i = 0; i < _nrandom_thread; i++) vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; #endif } delete []random_thread; @@ -588,7 +588,19 @@ void PairDPDIntel::read_restart_settings(FILE *fp) PairDPD::read_restart_settings(fp); _nrandom_thread = comm->nthreads; - #ifdef LMP_NO_MKL_RNG + #ifdef LMP_USE_MKL_RNG + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #else random_thread =new RanMars*[comm->nthreads]; random_thread[0] = random; @@ -601,17 +613,5 @@ void PairDPDIntel::read_restart_settings(FILE *fp) } #endif - #else - - random_thread=new VSLStreamStatePtr[comm->nthreads]; - #if defined(_OPENMP) - #pragma omp parallel - { - int tid = omp_get_thread_num(); - vslNewStream(&random_thread[tid], LMP_MKL_RNG, - seed + comm->me + comm->nprocs * tid ); - } - #endif - #endif } diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h index 9181ff38f4..416d873c00 100644 --- a/src/USER-INTEL/pair_dpd_intel.h +++ b/src/USER-INTEL/pair_dpd_intel.h @@ -28,10 +28,10 @@ PairStyle(dpd/intel,PairDPDIntel) #include "pair_dpd.h" #include "fix_intel.h" -#ifdef LMP_NO_MKL_RNG -#include "random_mars.h" -#else +#ifdef LMP_USE_MKL_RNG #include "mkl_vsl.h" +#else +#include "random_mars.h" #endif namespace LAMMPS_NS { @@ -46,15 +46,15 @@ class PairDPDIntel : public PairDPD { void settings(int, char **); void init_style(); void read_restart_settings(FILE *); - + private: FixIntel *fix; int _cop, _onetype, _nrandom_thread; - #ifdef LMP_NO_MKL_RNG - RanMars **random_thread; - #else + #ifdef LMP_USE_MKL_RNG VSLStreamStatePtr *random_thread; + #else + RanMars 
**random_thread; #endif template class ForceConst; @@ -86,7 +86,7 @@ class PairDPDIntel : public PairDPD { ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, - Memory *memory, const int cop); + Memory *memory, const int cop); private: int _ntypes, _cop; From 466fde6443bf2c7c7b96502cc3ceecb0a24c979f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 2 Oct 2017 21:20:26 -0400 Subject: [PATCH 33/53] update documentation for the reversal in the INTEL_MKL_RNG define --- doc/src/accelerate_intel.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index c858ca0940..e585209cf5 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -82,10 +82,11 @@ this order :l The {newton} setting applies to all atoms, not just atoms shared between MPI tasks :l Vectorization can change the order for adding pairwise forces :l -Unless specified otherwise at build time, the random number -generator for dissipative particle dynamics uses the Mersenne -Twister generator (that should be more robust than the standard -generator) :l +When using the -DLMP_USE_MKL_RNG define (all included intel optimized +makefiles do) at build time, the random number generator for +dissipative particle dynamics (pair style dpd/intel) uses the Mersenne +Twister generator included in the Intel MKL library (that should be +more robust than the default Marsaglia random number generator) :l :ule The precision mode (described below) used with the USER-INTEL From d2aa05cb3661497c70204ae8ea0822689123ebff Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 2 Oct 2017 21:24:51 -0400 Subject: [PATCH 34/53] update README in USER-INTEL for recent LRT logic reversal --- src/USER-INTEL/README | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index
35cde38f15..edfc69120c 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -42,11 +42,11 @@ available. This allow for running most styles in LAMMPS with threading. ----------------------------------------------------------------------------- -The Long-Range Thread mode (LRT) in the Intel package currently uses -pthreads by default. If pthreads are not supported in the build environment, -the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for -builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to -build with compilers that support threads using the C++11 standard. When using +The Long-Range Thread mode (LRT) in the Intel package is enabled through the +-DLMP_INTEL_USELRT define at compile time. All intel optimized makefiles +include this define. This feature will use pthreads by default. +Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that +support threads intrinsically using the C++11 standard. When using LRT mode, you might need to disable OpenMP affinity settings (e.g. export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings need to be changed. From 5e89269631263f7b800e6db09546f580d93b03a9 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 2 Oct 2017 23:41:14 -0700 Subject: [PATCH 35/53] Minor adjustments to intel makefiles and documentation based on the reversed preprocessor logic and default memory align. Removing knl_coprocessor makefile. 
--- doc/src/accelerate_intel.txt | 42 +++--- src/MAKE/MACHINES/Makefile.cori2 | 7 +- src/MAKE/OPTIONS/Makefile.intel_cpu | 9 +- src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi | 5 +- src/MAKE/OPTIONS/Makefile.intel_cpu_mpich | 6 +- src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi | 8 +- .../OPTIONS/Makefile.intel_knl_coprocessor | 125 ------------------ src/MAKE/OPTIONS/Makefile.knl | 6 +- src/USER-INTEL/README | 11 +- src/USER-INTEL/verlet_lrt_intel.cpp | 2 +- src/USER-INTEL/verlet_lrt_intel.h | 5 +- 11 files changed, 50 insertions(+), 176 deletions(-) delete mode 100644 src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index e585209cf5..aaa38d7de2 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -27,12 +27,12 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously. Angle Styles: charmm, harmonic :ulb,l Bond Styles: fene, fourier, harmonic :l Dihedral Styles: charmm, harmonic, opls :l -Fixes: nve, npt, nvt, nvt/sllod :l +Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l Improper Styles: cvff, harmonic :l Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, -lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo, -sw, tersoff :l +lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, +rebo, sw, tersoff :l K-Space Styles: pppm, pppm/disp :l :ule @@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks). :c,image(JPG/user_intel.png) Results are speedups obtained on Intel Xeon E5-2697v4 processors -(code-named Broadwell) and Intel Xeon Phi 7250 processors -(code-named Knights Landing) with "June 2017" LAMMPS built with -Intel Parallel Studio 2017 update 2. Results are with 1 MPI task -per physical core. See {src/USER-INTEL/TEST/README} for the raw -simulation rates and instructions to reproduce. 
+(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named +Knights Landing), and Intel Xeon Gold 6148 processors (code-named +Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio +2017 update 2. Results are with 1 MPI task per physical core. See +{src/USER-INTEL/TEST/README} for the raw simulation rates and +instructions to reproduce. :line @@ -113,7 +114,7 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l For some of the simple 2-body potentials without long-range electrostatics, performance and scalability can be better with the "newton off" setting added to the input script :l -For simulations on higher node counts, add "processors * * * grid +For simulations on higher node counts, add "processors * * * grid numa" to the beginning of the input script for better scalability :l If using {kspace_style pppm} in the input script, add "kspace_modify diff ad" for better performance :l @@ -124,8 +125,8 @@ For Intel Xeon Phi CPUs: Runs should be performed using MCDRAM. :ulb,l :ule -For simulations using {kspace_style pppm} on Intel CPUs -supporting AVX-512: +For simulations using {kspace_style pppm} on Intel CPUs supporting +AVX-512: Add "kspace_modify diff ad" to the input script :ulb,l The command-line option should be changed to @@ -242,14 +243,17 @@ However, if you do not have coprocessors on your system, building without offload support will produce a smaller binary. The general requirements for Makefiles with the USER-INTEL package -are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When -using Intel compilers, "-restrict" is required and "-qopenmp" is -highly recommended for CCFLAGS and LINKFLAGS. LIB should include -"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD" -is required for CCFLAGS and "-qoffload" is required for LINKFLAGS. -Other recommended CCFLAG options for best performance are -"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2 --no-prec-div". +are as follows. 
When using Intel compilers, "-restrict" is required +and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS. +CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads +are not supported in the build environment) and "-DLMP_USE_MKL_RNG" +(unless Intel Math Kernel Library (MKL) is not available in the build +environment). For Intel compilers, LIB should include "-ltbbmalloc" +or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added +to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is +required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other +recommended CCFLAG options for best performance are "-O2 -fno-alias +-ansi-alias -qoverride-limits -fp-model fast=2 -no-prec-div". NOTE: The vectorization and math capabilities can differ depending on the CPU. For Intel compilers, the "-x" flag specifies the type of diff --git a/src/MAKE/MACHINES/Makefile.cori2 b/src/MAKE/MACHINES/Makefile.cori2 index a367d54080..45e1ab1f8a 100755 --- a/src/MAKE/MACHINES/Makefile.cori2 +++ b/src/MAKE/MACHINES/Makefile.cori2 @@ -15,13 +15,14 @@ SHELL = /bin/sh CC = CC OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \ + $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = CC -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu index 2c3cc51249..41d0f959fe 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu @@ -8,15 +8,14 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict 
$(OPTFLAGS) \ - -DLMP_USE_MKL_RNG +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = -ltbbmalloc -ltbbmalloc_proxy +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index ff2d0cc5c2..ef514f43c6 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -8,9 +8,8 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) \ - -DLMP_USE_MKL_RNG -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich index 40d517bce4..68f879860a 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich @@ -8,13 +8,13 @@ SHELL = /bin/sh CC = mpicxx -cxx=icc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -cxx=icc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi index fe1be99e58..457a64b223 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi @@ -9,14 +9,14 @@ 
SHELL = /bin/sh export OMPI_CXX = icc CC = mpicxx OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = -ltbbmalloc -ltbbmalloc_proxy +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor deleted file mode 100644 index 769c166105..0000000000 --- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor +++ /dev/null @@ -1,125 +0,0 @@ -# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT - -SHELL = /bin/sh - -# --------------------------------------------------------------------- -# compiler/linker settings -# specify flags and libraries needed for your compiler - -CC = mpiicpc -MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2 -CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ - -xHost -fno-alias -ansi-alias -restrict \ - -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT \ - -DLMP_USE_MKL_RNG - -SHFLAGS = -fPIC -DEPFLAGS = -M - -LINK = mpiicpc -LINKFLAGS = -g -O3 -xHost -qopenmp -qoffload $(MIC_OPT) -LIB = -ltbbmalloc -SIZE = size - -ARCHIVE = ar -ARFLAGS = -rc -SHLIBFLAGS = -shared - -# --------------------------------------------------------------------- -# LAMMPS-specific settings, all OPTIONAL -# specify settings for LAMMPS features you will use -# if you change any -D setting, do full re-compile after "make clean" - -# LAMMPS ifdef settings -# see possible settings in Section 2.2 (step 4) of manual - -LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG - -# MPI library -# see discussion in Section 2.2 (step 5) of manual -# MPI wrapper compiler/linker can provide this 
info -# can point to dummy MPI library in src/STUBS as in Makefile.serial -# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts -# INC = path for mpi.h, MPI compiler settings -# PATH = path for MPI library -# LIB = name of MPI library - -MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -MPI_PATH = -MPI_LIB = - -# FFT library -# see discussion in Section 2.2 (step 6) of manaul -# can be left blank to use provided KISS FFT library -# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings -# PATH = path for FFT library -# LIB = name of FFT library - -FFT_INC = -DFFT_MKL -DFFT_SINGLE -FFT_PATH = -FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core - -# JPEG and/or PNG library -# see discussion in Section 2.2 (step 7) of manual -# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC -# INC = path(s) for jpeglib.h and/or png.h -# PATH = path(s) for JPEG library and/or PNG library -# LIB = name(s) of JPEG library and/or PNG library - -JPG_INC = -JPG_PATH = -JPG_LIB = -ljpeg - -# --------------------------------------------------------------------- -# build rules and dependencies -# do not edit this section - -include Makefile.package.settings -include Makefile.package - -EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) -EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) -EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) -EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) -EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) - -# Path to src files - -vpath %.cpp .. -vpath %.h .. 
- -# Link target - -$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS) - $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) - $(SIZE) $(EXE) - -# Library targets - -lib: $(OBJ) $(EXTRA_LINK_DEPENDS) - $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) - -shlib: $(OBJ) $(EXTRA_LINK_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ - $(OBJ) $(EXTRA_LIB) $(LIB) - -# Compilation rules - -%.o:%.cpp $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< - -%.d:%.cpp $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ - -%.o:%.cu $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< - -# Individual dependencies - -depend : fastdep.exe $(SRC) - @./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1 - -fastdep.exe: ../DEPEND/fastdep.c - cc -O -o $@ $< - -sinclude .depend diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl index 881c51f0e4..8e266a4fce 100644 --- a/src/MAKE/OPTIONS/Makefile.knl +++ b/src/MAKE/OPTIONS/Makefile.knl @@ -8,13 +8,13 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = -ltbbmalloc SIZE = size diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index edfc69120c..871d881f39 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -53,12 +53,11 @@ need to be changed. ----------------------------------------------------------------------------- -The random number generator for Dissipative Particle Dynamics (DPD) in the -Intel package uses the Mersenne Twister pseudorandom number generator as -implemented in the Intel Math Kernel Library (MKL). 
This generator is faster -and more robust with a significantly longer period than the default DPD -generator. However, if MKL is not installed, the standard random number -generator can be used by adding the compile flag "-DLMP_NO_MKL_RNG". +Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG +should be added to the compile flags. This will enable using the MKL Mersenne +Twister random number generator (RNG) for Dissipative Particle Dynamics +(DPD). This RNG can allow significantly faster performance and it also has a +significantly longer period than the standard RNG for DPD. ----------------------------------------------------------------------------- diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index 81f4586143..9ff5f85176 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -68,7 +68,7 @@ void VerletLRTIntel::init() _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0)); - #ifdef LMP_INTEL_NOLRT + #ifndef LMP_INTEL_USELRT error->all(FLERR, "LRT otion for Intel package disabled at compile time"); #endif diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h index 813cd53605..0d7154ff64 100644 --- a/src/USER-INTEL/verlet_lrt_intel.h +++ b/src/USER-INTEL/verlet_lrt_intel.h @@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel) #include "verlet.h" #include "pppm_intel.h" -#ifndef LMP_INTEL_USELRT -#define LMP_INTEL_NOLRT -#else - +#ifdef LMP_INTEL_USELRT #ifdef LMP_INTEL_LRT11 #define _LMP_INTEL_LRT_11 #include From 9dc42fd4db713cb74d52697d0e1af2f6404867e3 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 2 Oct 2017 23:53:05 -0700 Subject: [PATCH 36/53] intel_simd.h is currently also needed by dihedral/charmm, not just sw. 
--- src/USER-INTEL/Install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh index f7163e6791..da553d158a 100644 --- a/src/USER-INTEL/Install.sh +++ b/src/USER-INTEL/Install.sh @@ -46,7 +46,7 @@ action nbin_intel.h action nbin_intel.cpp action npair_intel.h action npair_intel.cpp -action intel_simd.h pair_sw_intel.cpp +action intel_simd.h action intel_intrinsics.h pair_tersoff_intel.cpp action intel_intrinsics_airebo.h pair_airebo_intel.cpp From 197f08278442df28c96eaaf2a3b9f350d7432dae Mon Sep 17 00:00:00 2001 From: James Barnett Date: Tue, 3 Oct 2017 11:15:44 -0400 Subject: [PATCH 37/53] cmake: Add -restrict for Intel compilers for some packages Some packages (USER-OMP, OPT, and USER-INTEL) require the -restrict flag when using the Intel compiler. --- cmake/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bc33da60de..9a74a788d0 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -150,6 +150,11 @@ if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() +if((ENABLE_USER-OMP OR ENABLE_OPT OR ENABLE_USER-INTEL) AND + (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict") +endif() + if(ENABLE_KSPACE) set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package") set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2) From ca032f21fbfa9f5c3de41e09b7c94be220ebfc07 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 3 Oct 2017 10:14:24 -0600 Subject: [PATCH 38/53] Add Kokkos threaded reverse comm option --- doc/src/package.txt | 12 +- src/KOKKOS/atom_vec_atomic_kokkos.cpp | 444 ------------------- src/KOKKOS/atom_vec_atomic_kokkos.h | 18 - src/KOKKOS/atom_vec_bond_kokkos.cpp | 442 ------------------- src/KOKKOS/atom_vec_bond_kokkos.h | 18 - src/KOKKOS/atom_vec_charge_kokkos.cpp | 391 
----------------- src/KOKKOS/atom_vec_charge_kokkos.h | 18 - src/KOKKOS/atom_vec_dpd_kokkos.h | 3 - src/KOKKOS/atom_vec_full_kokkos.cpp | 446 -------------------- src/KOKKOS/atom_vec_full_kokkos.h | 18 - src/KOKKOS/atom_vec_kokkos.cpp | 586 ++++++++++++++++++++++++++ src/KOKKOS/atom_vec_kokkos.h | 41 +- src/KOKKOS/comm_kokkos.cpp | 119 ++++-- src/KOKKOS/comm_kokkos.h | 7 +- src/KOKKOS/kokkos.cpp | 27 +- src/KOKKOS/kokkos.h | 2 + src/comm_brick.cpp | 6 +- 17 files changed, 748 insertions(+), 1850 deletions(-) diff --git a/doc/src/package.txt b/doc/src/package.txt index 58f6a5e34d..5c698934e8 100644 --- a/doc/src/package.txt +++ b/doc/src/package.txt @@ -62,7 +62,7 @@ args = arguments specific to the style :l {no_affinity} values = none {kokkos} args = keyword value ... zero or more keyword/value pairs may be appended - keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} + keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} {neigh} value = {full} or {half} full = full neighbor list half = half neighbor list built in thread-safe manner @@ -75,9 +75,10 @@ args = arguments specific to the style :l {binsize} value = size size = bin size for neighbor list construction (distance units) {comm} value = {no} or {host} or {device} - use value for both comm/exchange and comm/forward + use value for comm/exchange and comm/forward and comm/reverse {comm/exchange} value = {no} or {host} or {device} {comm/forward} value = {no} or {host} or {device} + {comm/reverse} value = {no} or {host} or {device} no = perform communication pack/unpack in non-KOKKOS mode host = perform pack/unpack on host (e.g. with OpenMP threading) device = perform pack/unpack on device (e.g. on GPU) @@ -429,17 +430,18 @@ Coulombic solver"_kspace_style.html because the GPU is faster at performing pairwise interactions, then this rule of thumb may give too large a binsize. 
-The {comm} and {comm/exchange} and {comm/forward} keywords determine +The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine whether the host or device performs the packing and unpacking of data when communicating per-atom data between processors. "Exchange" communication happens only on timesteps that neighbor lists are rebuilt. The data is only for atoms that migrate to new processors. -"Forward" communication happens every timestep. The data is for atom +"Forward" communication happens every timestep. "Reverse" communication +happens every timestep if the {newton} option is on. The data is for atom coordinates and any other atom properties that needs to be updated for ghost atoms owned by each processor. The {comm} keyword is simply a short-cut to set the same value -for both the {comm/exchange} and {comm/forward} keywords. +for all of the {comm/exchange} and {comm/forward} and {comm/reverse} keywords. The value options for all 3 keywords are {no} or {host} or {device}. 
A value of {no} means to use the standard non-KOKKOS method of diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp index b63dc5fb8c..6c610c8c11 100644 --- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp +++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp @@ -136,450 +136,6 @@ void AtomVecAtomicKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecAtomicKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecAtomicKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* 
---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackComm 
f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecAtomicKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecAtomicKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) 
{ - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecAtomicKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct 
AtomVecAtomicKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecAtomicKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecAtomicKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecAtomicKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, 
double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - 
h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) { - sync(Host,F_MASK); - modified(Host,F_MASK); - } - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecAtomicKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h index 5e9a72c2e3..e4d2654e2c 100644 --- a/src/KOKKOS/atom_vec_atomic_kokkos.h +++ b/src/KOKKOS/atom_vec_atomic_kokkos.h @@ -33,12 +33,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { virtual ~AtomVecAtomicKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); void unpack_border(int, int, double *); @@ -55,15 +49,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void 
unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -99,9 +84,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::tdual_int_1d k_count; }; diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp index e0f29a27bb..076144420c 100644 --- a/src/KOKKOS/atom_vec_bond_kokkos.cpp +++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp @@ -178,448 +178,6 @@ void AtomVecBondKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecBondKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecBondKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - 
KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - 
domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecBondKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecBondKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + 
_pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct 
AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecBondKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecBondKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecBondKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecBondKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecBondKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = 
pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - 
last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecBondKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecBondKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecBondKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_bond_kokkos.h b/src/KOKKOS/atom_vec_bond_kokkos.h index 3dcc99fa78..7ec15450ef 100644 --- a/src/KOKKOS/atom_vec_bond_kokkos.h +++ b/src/KOKKOS/atom_vec_bond_kokkos.h @@ -32,12 +32,6 @@ class AtomVecBondKokkos : public AtomVecKokkos { virtual ~AtomVecBondKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int 
pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -59,15 +53,6 @@ class AtomVecBondKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -112,9 +97,6 @@ class AtomVecBondKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_tagint_1d d_molecule; DAT::t_int_2d d_nspecial; diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp index 89f7e91c2b..7b8b74b405 100644 --- a/src/KOKKOS/atom_vec_charge_kokkos.cpp +++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp @@ -199,397 +199,6 @@ struct AtomVecChargeKokkos_PackComm { /* ---------------------------------------------------------------------- */ -int AtomVecChargeKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - 
Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackComm f(atomKK->k_x,buf,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecChargeKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecChargeKokkos_PackCommSelf( - const typename 
DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, - const int nfirst, const int &pbc_flag, const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - 
domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecChargeKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, - domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecChargeKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecChargeKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void 
AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecChargeKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecChargeKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) 
{ - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecChargeKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecChargeKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < 
n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecChargeKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_charge_kokkos.h b/src/KOKKOS/atom_vec_charge_kokkos.h index f9b385e7ed..e9ff70bbe1 100644 --- a/src/KOKKOS/atom_vec_charge_kokkos.h +++ b/src/KOKKOS/atom_vec_charge_kokkos.h @@ -33,12 +33,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { virtual ~AtomVecChargeKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -60,15 +54,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -108,9 +93,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_float_1d d_q; diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h 
b/src/KOKKOS/atom_vec_dpd_kokkos.h index 372404cc7d..cec1b82357 100644 --- a/src/KOKKOS/atom_vec_dpd_kokkos.h +++ b/src/KOKKOS/atom_vec_dpd_kokkos.h @@ -111,9 +111,6 @@ class AtomVecDPDKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::tdual_int_1d k_count; }; diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp index fd7eaf7c81..8e9abe4067 100644 --- a/src/KOKKOS/atom_vec_full_kokkos.cpp +++ b/src/KOKKOS/atom_vec_full_kokkos.cpp @@ -307,452 +307,6 @@ void AtomVecFullKokkos::copy(int i, int j, int delflag) /* ---------------------------------------------------------------------- */ -template -struct AtomVecFullKokkos_PackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_xfloat_2d_um _buf; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecFullKokkos_PackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - const size_t maxsend = (buf.view().dimension_0() - *buf.view().dimension_1())/3; - const size_t elements = 3; - buffer_view(_buf,buf,maxsend,elements); - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _buf(i,0) = _x(j,0); - _buf(i,1) = _x(j,1); - _buf(i,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _buf(i,0) = _x(j,0) + 
_pbc[0]*_xprd; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - } -}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_kokkos(const int &n, - const DAT::tdual_int_2d &list, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, - const int* const pbc) -{ - // Check whether to always run forward communication on the host - // Choose correct forward PackComm kernel - - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } 
else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackComm - f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - - return n*size_forward; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecFullKokkos_PackCommSelf { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array_randomread _x; - typename ArrayTypes::t_x_array _xw; - int _nfirst; - typename ArrayTypes::t_int_2d_const _list; - const int _iswap; - X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; - X_FLOAT _pbc[6]; - - AtomVecFullKokkos_PackCommSelf( - const typename DAT::tdual_x_array &x, - const int &nfirst, - const typename DAT::tdual_int_2d &list, - const int & iswap, - const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, - const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): - _x(x.view()),_xw(x.view()),_nfirst(nfirst), - _list(list.view()),_iswap(iswap), - _xprd(xprd),_yprd(yprd),_zprd(zprd), - _xy(xy),_xz(xz),_yz(yz) { - _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; - _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; - }; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - const int j = _list(_iswap,i); - if (PBC_FLAG == 0) { - _xw(i+_nfirst,0) = _x(j,0); - _xw(i+_nfirst,1) = _x(j,1); - _xw(i+_nfirst,2) = _x(j,2); - } else { - if (TRICLINIC == 0) { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } else { - _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; - _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; - _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; - } - } - - } 
-}; - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, - const int nfirst, const int &pbc_flag, - const int* const pbc) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - if(pbc_flag) { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } else { - if(domain->triclinic) { - struct AtomVecFullKokkos_PackCommSelf - f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } else { - struct AtomVecFullKokkos_PackCommSelf - 
f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd, - domain->xy,domain->xz,domain->yz,pbc); - Kokkos::parallel_for(n,f); - } - } - } - return n*3; -} - -/* ---------------------------------------------------------------------- */ - -template -struct AtomVecFullKokkos_UnpackComm { - typedef DeviceType device_type; - - typename ArrayTypes::t_x_array _x; - typename ArrayTypes::t_xfloat_2d_const _buf; - int _first; - - AtomVecFullKokkos_UnpackComm( - const typename DAT::tdual_x_array &x, - const typename DAT::tdual_xfloat_2d &buf, - const int& first):_x(x.view()),_buf(buf.view()), - _first(first) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int& i) const { - _x(i+_first,0) = _buf(i,0); - _x(i+_first,1) = _buf(i,1); - _x(i+_first,2) = _buf(i,2); - } -}; - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first, - const DAT::tdual_xfloat_2d &buf ) { - if(commKK->forward_comm_on_host) { - sync(Host,X_MASK); - modified(Host,X_MASK); - struct AtomVecFullKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } else { - sync(Device,X_MASK); - modified(Device,X_MASK); - struct AtomVecFullKokkos_UnpackComm f(atomKK->k_x,buf,first); - Kokkos::parallel_for(n,f); - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - for (i = 0; i < n; i++) { - j 
= list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_comm_vel(int n, int *list, double *buf, - int pbc_flag, int *pbc) -{ - int i,j,m; - double dx,dy,dz,dvx,dvy,dvz; - - m = 0; - if (pbc_flag == 0) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0); - buf[m++] = h_x(j,1); - buf[m++] = h_x(j,2); - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - if (domain->triclinic == 0) { - dx = pbc[0]*domain->xprd; - dy = pbc[1]*domain->yprd; - dz = pbc[2]*domain->zprd; - } else { - dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; - dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; - dz = pbc[2]*domain->zprd; - } - if (!deform_vremap) { - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } else { - dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; - dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; - dvz = pbc[2]*h_rate[2]; - for (i = 0; i < n; i++) { - j = list[i]; - buf[m++] = h_x(j,0) + dx; - buf[m++] = h_x(j,1) + dy; - buf[m++] = h_x(j,2) + dz; - if (mask[i] & deform_groupbit) { - buf[m++] = h_v(j,0) + dvx; - buf[m++] = h_v(j,1) + dvy; - buf[m++] = h_v(j,2) + dvz; - } else { - buf[m++] = h_v(j,0); - buf[m++] = h_v(j,1); - buf[m++] = h_v(j,2); - } - } - } - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_comm(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -void 
AtomVecFullKokkos::unpack_comm_vel(int n, int first, double *buf) -{ - int i,m,last; - - m = 0; - last = first + n; - for (i = first; i < last; i++) { - h_x(i,0) = buf[m++]; - h_x(i,1) = buf[m++]; - h_x(i,2) = buf[m++]; - h_v(i,0) = buf[m++]; - h_v(i,1) = buf[m++]; - h_v(i,2) = buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - -int AtomVecFullKokkos::pack_reverse(int n, int first, double *buf) -{ - if(n > 0) - sync(Host,F_MASK); - - int m = 0; - const int last = first + n; - for (int i = first; i < last; i++) { - buf[m++] = h_f(i,0); - buf[m++] = h_f(i,1); - buf[m++] = h_f(i,2); - } - return m; -} - -/* ---------------------------------------------------------------------- */ - -void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf) -{ - if(n > 0) - modified(Host,F_MASK); - - int m = 0; - for (int i = 0; i < n; i++) { - const int j = list[i]; - h_f(j,0) += buf[m++]; - h_f(j,1) += buf[m++]; - h_f(j,2) += buf[m++]; - } -} - -/* ---------------------------------------------------------------------- */ - template struct AtomVecFullKokkos_PackBorder { typedef DeviceType device_type; diff --git a/src/KOKKOS/atom_vec_full_kokkos.h b/src/KOKKOS/atom_vec_full_kokkos.h index 760df087e1..33760a8b5f 100644 --- a/src/KOKKOS/atom_vec_full_kokkos.h +++ b/src/KOKKOS/atom_vec_full_kokkos.h @@ -32,12 +32,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { virtual ~AtomVecFullKokkos() {} void grow(int); void copy(int, int, int); - int pack_comm(int, int *, double *, int, int *); - int pack_comm_vel(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); - void unpack_comm_vel(int, int, double *); - int pack_reverse(int, int, double *); - void unpack_reverse(int, int *, double *); int pack_border(int, int *, double *, int, int *); int pack_border_vel(int, int *, double *, int, int *); int pack_border_hybrid(int, int *, double *); @@ -59,15 +53,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { 
bigint memory_usage(); void grow_reset(); - int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, - const int & iswap, - const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]); - void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf); - int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, - const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]); int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space); @@ -125,9 +110,6 @@ class AtomVecFullKokkos : public AtomVecKokkos { DAT::t_x_array d_x; DAT::t_v_array d_v; DAT::t_f_array d_f; - HAT::t_x_array h_x; - HAT::t_v_array h_v; - HAT::t_f_array h_f; DAT::t_float_1d d_q; HAT::t_float_1d h_q; diff --git a/src/KOKKOS/atom_vec_kokkos.cpp b/src/KOKKOS/atom_vec_kokkos.cpp index 5542991395..03fb2a4ead 100644 --- a/src/KOKKOS/atom_vec_kokkos.cpp +++ b/src/KOKKOS/atom_vec_kokkos.cpp @@ -12,6 +12,10 @@ ------------------------------------------------------------------------- */ #include "atom_vec_kokkos.h" +#include "atom_kokkos.h" +#include "comm_kokkos.h" +#include "domain.h" +#include "atom_masks.h" using namespace LAMMPS_NS; @@ -24,3 +28,585 @@ AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp) buffer_size = 0; } +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackComm { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array_randomread _x; + typename ArrayTypes::t_xfloat_2d_um _buf; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; + X_FLOAT _pbc[6]; + + AtomVecKokkos_PackComm( + const typename DAT::tdual_x_array &x, + const typename DAT::tdual_xfloat_2d &buf, + const typename DAT::tdual_int_2d &list, + const int & iswap, + const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, + const X_FLOAT 
&xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): + _x(x.view()),_list(list.view()),_iswap(iswap), + _xprd(xprd),_yprd(yprd),_zprd(zprd), + _xy(xy),_xz(xz),_yz(yz) { + const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; + const size_t elements = 3; + buffer_view(_buf,buf,maxsend,elements); + _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; + _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _buf(i,0) = _x(j,0); + _buf(i,1) = _x(j,1); + _buf(i,2) = _x(j,2); + } else { + if (TRICLINIC == 0) { + _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; + _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; + _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; + } else { + _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; + _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; + _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; + } + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_kokkos(const int &n, + const DAT::tdual_int_2d &list, + const int & iswap, + const DAT::tdual_xfloat_2d &buf, + const int &pbc_flag, + const int* const pbc) +{ + // Check whether to always run forward communication on the host + // Choose correct forward PackComm kernel + + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + 
domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } else { + sync(Device,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackComm f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } + + return n*size_forward; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackCommSelf { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array_randomread _x; + typename ArrayTypes::t_x_array _xw; + int _nfirst; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; + X_FLOAT _pbc[6]; + + AtomVecKokkos_PackCommSelf( + const typename DAT::tdual_x_array &x, + const int &nfirst, + const typename DAT::tdual_int_2d &list, + const int & iswap, + const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, + const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): + _x(x.view()),_xw(x.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap), + _xprd(xprd),_yprd(yprd),_zprd(zprd), + _xy(xy),_xz(xz),_yz(yz) { + _pbc[0] = pbc[0]; 
_pbc[1] = pbc[1]; _pbc[2] = pbc[2]; + _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _xw(i+_nfirst,0) = _x(j,0); + _xw(i+_nfirst,1) = _x(j,1); + _xw(i+_nfirst,2) = _x(j,2); + } else { + if (TRICLINIC == 0) { + _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; + _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; + _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; + } else { + _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; + _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; + _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; + } + } + + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, + const int nfirst, const int &pbc_flag, const int* const pbc) { + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + modified(Host,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } else { + sync(Device,X_MASK); + modified(Device,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf 
f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecKokkos_PackCommSelf f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + } + return n*3; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnpackComm { + typedef DeviceType device_type; + + typename ArrayTypes::t_x_array _x; + typename ArrayTypes::t_xfloat_2d_const _buf; + int _first; + + AtomVecKokkos_UnpackComm( + const typename DAT::tdual_x_array &x, + const typename DAT::tdual_xfloat_2d &buf, + const int& first):_x(x.view()),_buf(buf.view()), + _first(first) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _x(i+_first,0) = _buf(i,0); + _x(i+_first,1) = _buf(i,1); + _x(i+_first,2) = _buf(i,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm_kokkos(const int &n, const int &first, + const DAT::tdual_xfloat_2d &buf ) { + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + modified(Host,X_MASK); + struct AtomVecKokkos_UnpackComm f(atomKK->k_x,buf,first); + Kokkos::parallel_for(n,f); + } else { + sync(Device,X_MASK); + modified(Device,X_MASK); + struct AtomVecKokkos_UnpackComm f(atomKK->k_x,buf,first); + Kokkos::parallel_for(n,f); + } +} + +/* ---------------------------------------------------------------------- */ + +int 
AtomVecKokkos::pack_comm(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_comm_vel(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz,dvx,dvy,dvz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + if (!deform_vremap) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; + dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; + dvz = pbc[2]*h_rate[2]; + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + if (atom->mask[i] & deform_groupbit) { + buf[m++] = h_v(j,0) + dvx; + 
buf[m++] = h_v(j,1) + dvy; + buf[m++] = h_v(j,2) + dvz; + } else { + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_comm_vel(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + h_v(i,0) = buf[m++]; + h_v(i,1) = buf[m++]; + h_v(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_PackReverse { + typedef DeviceType device_type; + + typename ArrayTypes::t_f_array_randomread _f; + typename ArrayTypes::t_ffloat_2d _buf; + int _first; + + AtomVecKokkos_PackReverse( + const typename DAT::tdual_f_array &f, + const typename DAT::tdual_ffloat_2d &buf, + const int& first):_f(f.view()),_buf(buf.view()), + _first(first) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _buf(i,0) = _f(i+_first,0); + _buf(i,1) = _f(i+_first,1); + _buf(i,2) = _f(i+_first,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_reverse_kokkos(const int &n, const int &first, + const DAT::tdual_ffloat_2d &buf ) { + if(commKK->reverse_comm_on_host) { + sync(Host,F_MASK); + struct AtomVecKokkos_PackReverse f(atomKK->k_f,buf,first); + Kokkos::parallel_for(n,f); + } else { + sync(Device,F_MASK); + struct AtomVecKokkos_PackReverse f(atomKK->k_f,buf,first); + Kokkos::parallel_for(n,f); + } + + return n*size_reverse; +} + +/* 
---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnPackReverseSelf { + typedef DeviceType device_type; + + typename ArrayTypes::t_f_array_randomread _f; + typename ArrayTypes::t_f_array _fw; + int _nfirst; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + + AtomVecKokkos_UnPackReverseSelf( + const typename DAT::tdual_f_array &f, + const int &nfirst, + const typename DAT::tdual_int_2d &list, + const int & iswap): + _f(f.view()),_fw(f.view()),_nfirst(nfirst),_list(list.view()),_iswap(iswap) { + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + _fw(j,0) += _f(i+_nfirst,0); + _fw(j,1) += _f(i+_nfirst,1); + _fw(j,2) += _f(i+_nfirst,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, + const int nfirst) { + if(commKK->reverse_comm_on_host) { + sync(Host,F_MASK); + struct AtomVecKokkos_UnPackReverseSelf f(atomKK->k_f,nfirst,list,iswap); + Kokkos::parallel_for(n,f); + modified(Host,F_MASK); + } else { + sync(Device,F_MASK); + struct AtomVecKokkos_UnPackReverseSelf f(atomKK->k_f,nfirst,list,iswap); + Kokkos::parallel_for(n,f); + modified(Device,F_MASK); + } + return n*3; +} + +/* ---------------------------------------------------------------------- */ + +template +struct AtomVecKokkos_UnPackReverse { + typedef DeviceType device_type; + + typename ArrayTypes::t_f_array _f; + typename ArrayTypes::t_ffloat_2d_const _buf; + typename ArrayTypes::t_int_2d_const _list; + const int _iswap; + + AtomVecKokkos_UnPackReverse( + const typename DAT::tdual_f_array &f, + const typename DAT::tdual_ffloat_2d &buf, + const typename DAT::tdual_int_2d &list, + const int & iswap): + _f(f.view()),_list(list.view()),_iswap(iswap) { + const size_t maxsend = (buf.view().dimension_0()*buf.view().dimension_1())/3; + const 
size_t elements = 3; + buffer_view(_buf,buf,maxsend,elements); + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + _f(j,0) += _buf(i,0); + _f(j,1) += _buf(i,1); + _f(j,2) += _buf(i,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_reverse_kokkos(const int &n, + const DAT::tdual_int_2d &list, + const int & iswap, + const DAT::tdual_ffloat_2d &buf) +{ + // Check whether to always run reverse communication on the host + // Choose correct reverse UnPackReverse kernel + + if(commKK->reverse_comm_on_host) { + struct AtomVecKokkos_UnPackReverse f(atomKK->k_f,buf,list,iswap); + Kokkos::parallel_for(n,f); + modified(Host,F_MASK); + } else { + struct AtomVecKokkos_UnPackReverse f(atomKK->k_f,buf,list,iswap); + Kokkos::parallel_for(n,f); + modified(Device,F_MASK); + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecKokkos::pack_reverse(int n, int first, double *buf) +{ + if(n > 0) + sync(Host,F_MASK); + + int m = 0; + const int last = first + n; + for (int i = first; i < last; i++) { + buf[m++] = h_f(i,0); + buf[m++] = h_f(i,1); + buf[m++] = h_f(i,2); + } + + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf) +{ + int m = 0; + for (int i = 0; i < n; i++) { + const int j = list[i]; + h_f(j,0) += buf[m++]; + h_f(j,1) += buf[m++]; + h_f(j,2) += buf[m++]; + } + + if(n > 0) + modified(Host,F_MASK); +} diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h index 7f593f235f..20a07ec443 100644 --- a/src/KOKKOS/atom_vec_kokkos.h +++ b/src/KOKKOS/atom_vec_kokkos.h @@ -35,29 +35,48 @@ class AtomVecKokkos : public AtomVec { public: AtomVecKokkos(class LAMMPS *); virtual ~AtomVecKokkos() {} + virtual int pack_comm(int, int *, double *, int, int *); + virtual int pack_comm_vel(int, int *, 
double *, int, int *); + virtual void unpack_comm(int, int, double *); + virtual void unpack_comm_vel(int, int, double *); + virtual int pack_reverse(int, int, double *); + virtual void unpack_reverse(int, int *, double *); virtual void sync(ExecutionSpace space, unsigned int mask) = 0; virtual void modified(ExecutionSpace space, unsigned int mask) = 0; - virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) {}; + virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0; virtual int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, const int nfirst, - const int &pbc_flag, const int pbc[]) = 0; - //{return 0;} + const int &pbc_flag, const int pbc[]); + virtual int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list, const int & iswap, const DAT::tdual_xfloat_2d &buf, - const int &pbc_flag, const int pbc[]) = 0; - //{return 0;} + const int &pbc_flag, const int pbc[]); + virtual void unpack_comm_kokkos(const int &n, const int &nfirst, - const DAT::tdual_xfloat_2d &buf) = 0; + const DAT::tdual_xfloat_2d &buf); + + virtual int + unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const int nfirst); + + virtual int + pack_reverse_kokkos(const int &n, const int &nfirst, + const DAT::tdual_ffloat_2d &buf); + + virtual void + unpack_reverse_kokkos(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const DAT::tdual_ffloat_2d &buf); + virtual int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) = 0; - //{return 0;}; + virtual void unpack_border_kokkos(const int &n, const int &nfirst, const DAT::tdual_xfloat_2d &buf, @@ -68,15 +87,19 @@ class AtomVecKokkos : public AtomVec { DAT::tdual_int_1d k_sendlist, DAT::tdual_int_1d k_copylist, ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) = 0; - //{return 0;}; + virtual int 
unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space) = 0; - //{return 0;}; + protected: + HAT::t_x_array h_x; + HAT::t_v_array h_v; + HAT::t_f_array h_f; + class CommKokkos *commKK; size_t buffer_size; void* buffer; diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index a8b591e210..d4d348d7e2 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -62,7 +62,7 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) tdual_int_1d("comm:k_exchange_sendlist",100); k_exchange_copylist = DAT:: tdual_int_1d("comm:k_exchange_copylist",100); - k_count = DAT::tdual_int_1d("comm:k_count",1); + k_count = DAT::tdual_int_scalar("comm:k_count"); k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100); memory->destroy(maxsendlist); @@ -103,8 +103,10 @@ void CommKokkos::init() atomKK = (AtomKokkos *) atom; exchange_comm_classic = lmp->kokkos->exchange_comm_classic; forward_comm_classic = lmp->kokkos->forward_comm_classic; + reverse_comm_classic = lmp->kokkos->reverse_comm_classic; exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host; forward_comm_on_host = lmp->kokkos->forward_comm_on_host; + reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host; CommBrick::init(); @@ -133,8 +135,11 @@ void CommKokkos::init() if (force->newton == 0) check_reverse = 0; if (force->pair) check_reverse += force->pair->comm_reverse_off; - if(check_reverse || check_forward) + if (check_reverse || check_forward) forward_comm_classic = true; + + if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet + reverse_comm_classic = true; } /* ---------------------------------------------------------------------- @@ -174,7 +179,6 @@ void CommKokkos::forward_comm_device(int dummy) int n; MPI_Request request; AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; - double **x = atom->x; double *buf; // exchange data with another proc @@ -184,22 +188,17 @@ void 
CommKokkos::forward_comm_device(int dummy) k_sendlist.sync(); for (int iswap = 0; iswap < nswap; iswap++) { - if (sendproc[iswap] != me) { if (comm_x_only) { - atomKK->sync(ExecutionSpaceFromDevice::space,X_MASK); - if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]]; - else buf = NULL; - if (size_forward_recv[iswap]) { + atomKK->sync(ExecutionSpaceFromDevice::space,X_MASK); buf = atomKK->k_x.view().ptr_on_device() + firstrecv[iswap]*atomKK->k_x.view().dimension_1(); MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, - recvproc[iswap],0,world,&request); + recvproc[iswap],0,world,&request); } n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist, iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]); - if (n) { MPI_Send(k_buf_send.view().ptr_on_device(), n,MPI_DOUBLE,sendproc[iswap],0,world); @@ -249,21 +248,91 @@ void CommKokkos::forward_comm_device(int dummy) } } } + +/* ---------------------------------------------------------------------- + reverse communication of forces on atoms every timestep + other per-atom attributes may also be sent via pack/unpack routines +------------------------------------------------------------------------- */ + void CommKokkos::reverse_comm() { + if (!reverse_comm_classic) { + if (reverse_comm_on_host) reverse_comm_device(); + else reverse_comm_device(); + return; + } + k_sendlist.sync(); + if (comm_f_only) atomKK->sync(Host,F_MASK); else atomKK->sync(Host,ALL_MASK); + CommBrick::reverse_comm(); + if (comm_f_only) atomKK->modified(Host,F_MASK); else atomKK->modified(Host,ALL_MASK); - atomKK->sync(Device,ALL_MASK); + + atomKK->sync(Device,ALL_MASK); // is this needed? 
} +template +void CommKokkos::reverse_comm_device() +{ + int n; + MPI_Request request; + AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; + double *buf; + + // exchange data with another proc + // if other proc is self, just copy + // if comm_f_only set, exchange or copy directly from f, don't pack + + k_sendlist.sync(); + + for (int iswap = nswap-1; iswap >= 0; iswap--) { + if (sendproc[iswap] != me) { + if (comm_f_only) { + if (size_reverse_recv[iswap]) + MPI_Irecv(k_buf_recv.view().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + if (size_reverse_send[iswap]) { + atomKK->sync(ExecutionSpaceFromDevice::space,F_MASK); + buf = atomKK->k_f.view().ptr_on_device() + + firstrecv[iswap]*atomKK->k_f.view().dimension_1(); + + MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, + recvproc[iswap],0,world); + } + if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); + atomKK->modified(ExecutionSpaceFromDevice:: + space,F_MASK); + } else { + if (size_reverse_recv[iswap]) + MPI_Irecv(k_buf_recv.view().ptr_on_device(), + size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send); + if (n) + MPI_Send(k_buf_send.view().ptr_on_device(),n, + MPI_DOUBLE,recvproc[iswap],0,world); + if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap, + k_buf_recv); + } else { + if (sendnum[iswap]) + n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap, + firstrecv[iswap]); + } + } +} + +/* ---------------------------------------------------------------------- */ + void CommKokkos::forward_comm_fix(Fix *fix, int size) { k_sendlist.sync(); @@ -409,7 +478,7 @@ struct BuildExchangeListFunctor { typename AT::t_x_array _x; int _nlocal,_dim; - typename AT::t_int_1d _nsend; + typename AT::t_int_scalar _nsend; typename AT::t_int_1d _sendlist; typename AT::t_int_1d 
_sendflag; @@ -417,7 +486,7 @@ struct BuildExchangeListFunctor { BuildExchangeListFunctor( const typename AT::tdual_x_array x, const typename AT::tdual_int_1d sendlist, - typename AT::tdual_int_1d nsend, + typename AT::tdual_int_scalar nsend, typename AT::tdual_int_1d sendflag,int nlocal, int dim, X_FLOAT lo, X_FLOAT hi): _x(x.template view()), @@ -431,7 +500,7 @@ struct BuildExchangeListFunctor { KOKKOS_INLINE_FUNCTION void operator() (int i) const { if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) { - const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1); + const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1); if(mysend<_sendlist.dimension_0()) { _sendlist(mysend) = i; _sendflag(i) = 1; @@ -490,9 +559,9 @@ void CommKokkos::exchange_device() if (true) { if (k_sendflag.h_view.dimension_0()(); - k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0(); - while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { - k_count.h_view(0) = 0; + k_count.h_view() = k_exchange_sendlist.h_view.dimension_0(); + while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) { + k_count.h_view() = 0; k_count.modify(); k_count.sync(); @@ -505,10 +574,10 @@ void CommKokkos::exchange_device() k_count.modify(); k_count.sync(); - if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { - k_exchange_sendlist.resize(k_count.h_view(0)*1.1); - k_exchange_copylist.resize(k_count.h_view(0)*1.1); - k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0(); + if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) { + k_exchange_sendlist.resize(k_count.h_view()*1.1); + k_exchange_copylist.resize(k_count.h_view()*1.1); + k_count.h_view()=k_exchange_sendlist.h_view.dimension_0(); } } k_exchange_copylist.sync(); @@ -516,8 +585,8 @@ void CommKokkos::exchange_device() k_sendflag.sync(); int sendpos = nlocal-1; - nlocal -= k_count.h_view(0); - for(int i = 0; i < k_count.h_view(0); i++) { + nlocal -= k_count.h_view(); + for(int i = 0; i < k_count.h_view(); i++) { 
if (k_exchange_sendlist.h_view(i)(); k_exchange_copylist.sync(); - nsend = k_count.h_view(0); + nsend = k_count.h_view(); if (nsend > maxsend) grow_send_kokkos(nsend,1); nsend = - avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send, + avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send, k_exchange_sendlist,k_exchange_copylist, ExecutionSpaceFromDevice:: space,dim,lo,hi); diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h index 4065efd000..f137655cb8 100644 --- a/src/KOKKOS/comm_kokkos.h +++ b/src/KOKKOS/comm_kokkos.h @@ -25,15 +25,17 @@ class CommKokkos : public CommBrick { bool exchange_comm_classic; bool forward_comm_classic; + bool reverse_comm_classic; bool exchange_comm_on_host; bool forward_comm_on_host; + bool reverse_comm_on_host; CommKokkos(class LAMMPS *); ~CommKokkos(); void init(); void forward_comm(int dummy = 0); // forward comm of atom coords - void reverse_comm(); // reverse comm of atom coords + void reverse_comm(); // reverse comm of atom coords void exchange(); // move atoms to new procs void borders(); // setup list of atoms to comm @@ -47,6 +49,7 @@ class CommKokkos : public CommBrick { void reverse_comm_dump(class Dump *); // reverse comm from a Dump template void forward_comm_device(int dummy); + template void reverse_comm_device(); template void forward_comm_pair_device(Pair *pair); template void exchange_device(); template void borders_device(); @@ -56,7 +59,7 @@ class CommKokkos : public CommBrick { DAT::tdual_int_scalar k_total_send; DAT::tdual_xfloat_2d k_buf_send,k_buf_recv; DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag; - DAT::tdual_int_1d k_count; + DAT::tdual_int_scalar k_count; //double *buf_send; // send buffer for all comm //double *buf_recv; // recv buffer for all comm diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 072a802b54..2b02624dce 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -123,8 +123,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char 
**arg) : Pointers(lmp) neighflag_qeq_set = 0; exchange_comm_classic = 0; forward_comm_classic = 0; + reverse_comm_classic = 0; exchange_comm_on_host = 0; forward_comm_on_host = 0; + reverse_comm_on_host = 0; #ifdef KILL_KOKKOS_ON_SIGSEGV signal(SIGSEGV, my_signal_handler); @@ -158,8 +160,8 @@ void KokkosLMP::accelerator(int narg, char **arg) neighflag_qeq_set = 0; int newtonflag = 0; double binsize = 0.0; - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 0; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; int iarg = 0; while (iarg < narg) { @@ -200,13 +202,13 @@ void KokkosLMP::accelerator(int narg, char **arg) } else if (strcmp(arg[iarg],"comm") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) { - exchange_comm_classic = forward_comm_classic = 1; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1; } else if (strcmp(arg[iarg+1],"host") == 0) { - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 1; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1; } else if (strcmp(arg[iarg+1],"device") == 0) { - exchange_comm_classic = forward_comm_classic = 0; - exchange_comm_on_host = forward_comm_on_host = 0; + exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); iarg += 2; } else if (strcmp(arg[iarg],"comm/exchange") == 0) { @@ -231,6 +233,17 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); iarg += 2; + } else if (strcmp(arg[iarg],"comm/reverse") == 0) { + 
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) { + reverse_comm_classic = 0; + reverse_comm_on_host = 1; + } else if (strcmp(arg[iarg+1],"device") == 0) { + reverse_comm_classic = 0; + reverse_comm_on_host = 0; + } else error->all(FLERR,"Illegal package kokkos command"); + iarg += 2; } else error->all(FLERR,"Illegal package kokkos command"); } diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index 8e28b38cbf..7b7848f1f0 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -27,8 +27,10 @@ class KokkosLMP : protected Pointers { int neighflag_qeq_set; int exchange_comm_classic; int forward_comm_classic; + int reverse_comm_classic; int exchange_comm_on_host; int forward_comm_on_host; + int reverse_comm_on_host; int num_threads,ngpu; int numa; int auto_sync; diff --git a/src/comm_brick.cpp b/src/comm_brick.cpp index 3c972b8244..06227b7a84 100644 --- a/src/comm_brick.cpp +++ b/src/comm_brick.cpp @@ -476,8 +476,7 @@ void CommBrick::forward_comm(int dummy) if (sendproc[iswap] != me) { if (comm_x_only) { if (size_forward_recv[iswap]) { - if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]]; - else buf = NULL; + buf = x[firstrecv[iswap]]; MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, recvproc[iswap],0,world,&request); } @@ -547,8 +546,7 @@ void CommBrick::reverse_comm() MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE, sendproc[iswap],0,world,&request); if (size_reverse_send[iswap]) { - if (size_reverse_send[iswap]) buf = f[firstrecv[iswap]]; - else buf = NULL; + buf = f[firstrecv[iswap]]; MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, recvproc[iswap],0,world); } From 2876baafd07d31c3a3ab30cc38b087ebcfc07eab Mon Sep 17 00:00:00 2001 From: James Barnett Date: Tue, 3 Oct 2017 13:08:56 -0400 Subject: [PATCH 39/53] Use -restrict whenever Intel is used, no matter the package --- cmake/CMakeLists.txt | 9 ++++----- 1 file 
changed, 4 insertions(+), 5 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 9a74a788d0..ca71c41ddb 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -37,6 +37,10 @@ enable_language(CXX) ##################################################################### include(CheckCCompilerFlag) +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict") +endif() + ######################################################################## # User input options # ######################################################################## @@ -150,11 +154,6 @@ if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() -if((ENABLE_USER-OMP OR ENABLE_OPT OR ENABLE_USER-INTEL) AND - (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict") -endif() - if(ENABLE_KSPACE) set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package") set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2) From a55adf4a6848a734a492e2f5dc993041927db08a Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 3 Oct 2017 11:30:00 -0600 Subject: [PATCH 40/53] Update to Kokkos r2.04.04 and add workaround for performance regression --- lib/kokkos/CHANGELOG.md | 19 + lib/kokkos/Makefile.kokkos | 32 +- lib/kokkos/algorithms/src/Kokkos_Random.hpp | 237 ++++++++++++ lib/kokkos/algorithms/unit_tests/Makefile | 12 + lib/kokkos/algorithms/unit_tests/TestROCm.cpp | 112 ++++++ lib/kokkos/bin/hpcbind | 239 ++++++++---- lib/kokkos/bin/kokkos-bind | 221 ----------- lib/kokkos/bin/nvcc_wrapper | 15 +- lib/kokkos/config/master_history.txt | 1 + .../config/trilinos-integration/checkin-test | 2 +- .../containers/src/Kokkos_StaticCrsGraph.hpp | 149 ++++++++ .../KokkosExp_Cuda_IterateTile_Refactor.hpp | 160 ++++---- lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp | 4 +- lib/kokkos/core/src/Kokkos_Complex.hpp | 357 +++++++++++++----- 
lib/kokkos/core/src/Kokkos_Crs.hpp | 9 +- lib/kokkos/core/src/Kokkos_HBWSpace.hpp | 3 +- lib/kokkos/core/src/Kokkos_NumericTraits.hpp | 6 +- lib/kokkos/core/src/Kokkos_ROCm.hpp | 18 + lib/kokkos/core/src/Makefile | 1 + .../core/src/OpenMP/Kokkos_OpenMP_Exec.hpp | 1 + .../core/src/ROCm/Kokkos_ROCm_Reduce.hpp | 12 +- lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp | 4 +- lib/kokkos/core/src/impl/Kokkos_BitOps.hpp | 30 +- lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp | 4 - lib/kokkos/core/unit_test/TestComplex.hpp | 15 +- lib/kokkos/core/unit_test/TestMDRange.hpp | 121 ++++-- 26 files changed, 1222 insertions(+), 562 deletions(-) create mode 100644 lib/kokkos/algorithms/unit_tests/TestROCm.cpp delete mode 100755 lib/kokkos/bin/kokkos-bind diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 43d3f17d63..d414056187 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,24 @@ # Change Log +## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04) + +**Implemented enhancements:** + +- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082) +- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071) +- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052) +- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019) +- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952) +- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857) + +**Fixed bugs:** + +- Fix reduction\_identity\::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048) +- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041) +- (Experimental) 
HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094) +- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070) + ## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index b8236e8fd1..4641232a1f 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -443,7 +443,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_LIBS += -lmemkind + KOKKOS_LIBS += -lmemkind -lnuma tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp ) endif @@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P8. - KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 - KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P8. + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + endif + endif endif endif @@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P9. 
- KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 - KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P9 + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + endif + endif endif endif diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 9082e47052..3db9a145d7 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool::free_state(const Random_XorShift102 } +#endif + +#if defined(KOKKOS_ENABLE_ROCM) + + template<> + class Random_XorShift1024 { + private: + int p_; + const int state_idx_; + uint64_t* state_; + const int stride_; + friend class Random_XorShift1024_Pool; + public: + + typedef Kokkos::Experimental::ROCm device_type; + typedef Random_XorShift1024_Pool pool_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){ + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + tmp = tmp>>16; + return static_cast(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + uint64_t state_0 = 
state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + tmp = urand(); /* redraw until tmp < max_val; the old loop discarded the redraw and never terminated */ + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + tmp = urand64(); /* redraw until tmp < max_val */ + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast(urand()/2); + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + tmp = rand(); /* redraw until tmp < max_val */ + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + tmp = rand64(); /* redraw until tmp < max_val */ + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& 
end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return drand(end-start)+start; /* use the double overload; frand() truncated the range to float */ + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = 2.0*drand() - 1.0; + const double V = 2.0*drand() - 1.0; + S = U*U+V*V; + } + return U*std::sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + +template<> +inline +Random_XorShift64_Pool::Random_XorShift64_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift64 Random_XorShift64_Pool::get_state() const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift64(state_(i),i); +#else + return Random_XorShift64(state_(0),0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift64_Pool::free_state(const Random_XorShift64 &state) const { +#ifdef __HCC_ACCELERATOR__ + state_(state.state_idx_) = state.state_; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +template<> +inline +Random_XorShift1024_Pool::Random_XorShift1024_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift1024 Random_XorShift1024_Pool::get_state() 
const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift1024(state_, p_(i), i); +#else + return Random_XorShift1024(state_, p_(0), 0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift1024_Pool::free_state(const Random_XorShift1024 &state) const { +#ifdef __HCC_ACCELERATOR__ + for(int i=0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + locks_(state.state_idx_) = 0; + return; +#endif +} + + #endif diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index b74192ef18..a5a10c82ee 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_ROCm + TEST_TARGETS += test-rocm +endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_Threads @@ -51,6 +57,9 @@ endif KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda +KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm + KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o 
KokkosAlgorithms_UnitTest_Threads @@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosAlgorithms_UnitTest_Cuda ./KokkosAlgorithms_UnitTest_Cuda +test-rocm: KokkosAlgorithms_UnitTest_ROCm + ./KokkosAlgorithms_UnitTest_ROCm + test-threads: KokkosAlgorithms_UnitTest_Threads ./KokkosAlgorithms_UnitTest_Threads diff --git a/lib/kokkos/algorithms/unit_tests/TestROCm.cpp b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp new file mode 100644 index 0000000000..720b377ed2 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp @@ -0,0 +1,112 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#ifdef KOKKOS_ENABLE_ROCM + +#include +#include +#include + +#include + +#include + +#include +#include + +namespace Test { + +class rocm : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +void rocm_test_random_xorshift64( int num_draws ) +{ + Impl::test_random >(num_draws); +} + +void rocm_test_random_xorshift1024( int num_draws ) +{ + Impl::test_random >(num_draws); +} + + +#define ROCM_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( rocm, Random_XorShift64 ) { \ + rocm_test_random_xorshift64(num_draws); \ + } + +#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( rocm, Random_XorShift1024 ) { \ + rocm_test_random_xorshift1024(num_draws); \ + } + +#define ROCM_SORT_UNSIGNED( size ) \ + TEST_F( rocm, SortUnsigned ) { \ + Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \ + } + +ROCM_RANDOM_XORSHIFT64( 132141141 ) +ROCM_RANDOM_XORSHIFT1024( 52428813 ) +ROCM_SORT_UNSIGNED(171) + +#undef 
ROCM_RANDOM_XORSHIFT64 +#undef ROCM_RANDOM_XORSHIFT1024 +#undef ROCM_SORT_UNSIGNED +} +#else +void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {} +#endif /* #ifdef KOKKOS_ENABLE_ROCM */ + diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind index ca34648780..b88b334f8b 100755 --- a/lib/kokkos/bin/hpcbind +++ b/lib/kokkos/bin/hpcbind @@ -27,7 +27,7 @@ fi HPCBIND_HWLOC_PARENT_CPUSET="" if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then MY_PID="$BASHPID" - HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) + HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)" fi ################################################################################ @@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) ################################################################################ HPCBIND_QUEUE_NAME="" declare -i HPCBIND_QUEUE_INDEX=0 -declare -i HPCBIND_QUEUE_GPU_MAPPING=0 +declare -i HPCBIND_QUEUE_MAPPING=0 -if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 - HPCBIND_QUEUE_NAME="sbatch" +if [[ ! -z "${PMI_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mpich" + HPCBIND_QUEUE_INDEX=${PMI_RANK} +elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="openmpi" + HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK} +elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mvapich2" + HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK} +elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="slurm" HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} elif [[ ! -z "${LBS_JOBINDEX}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="bsub" HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} elif [[ ! 
-z "${ALPS_APP_PE}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="aprun" HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} fi - ################################################################################ # Show help ################################################################################ @@ -91,13 +102,14 @@ function show_help { echo " --proc-bind= Set the initial process mask for the script" echo " LOC can be any valid location argument for" echo " hwloc-calc Default: all" + echo " --whole-system ${cmd} will ignore the its parent process binding" echo " --distribute=N Distribute the current cpuset into N partitions" echo " --distribute-partition=I" echo " Use the i'th partition (zero based)" echo " --visible-gpus= Comma separated list of gpu ids" echo " Default: CUDA_VISIBLE_DEVICES or all gpus in" echo " sequential order" - echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU" + echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition" echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" echo " --openmp=M.m Set env variables for the given OpenMP version" echo " Default: 4.0" @@ -110,22 +122,30 @@ function show_help { echo " --force-openmp-proc-bind=" echo " Override logic for selecting OMP_PROC_BIND" echo " --no-openmp-nested Set OMP_NESTED to false" - echo " --show-bindings Show the bindings" - echo " --lstopo Show bindings in lstopo without executing a command" - echo " -v|--verbose Show options and relevant environment variables" + echo " --output-prefix=

Save the output to files of the form" + echo " P-N.log, P-N.out and P-N.err where P is the prefix" + echo " and N is the queue index or mpi rank (no spaces)" + echo " --output-mode= How console output should be handled." + echo " Options are all, rank0, and none. Default: rank0" + echo " --lstopo Show bindings in lstopo" + echo " -v|--verbose Print bindings and relevant environment variables" echo " -h|--help Show this message" echo "" echo "Sample Usage:" echo " Split the current process cpuset into 4 and use the 3rd partition" echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." - echo " Bing the process to all even cores" + echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus" + echo " and save the output to rank specific files" + echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\" + echo " --distribute=4 -v --output-prefix=output -- command ..." + echo " Bind the process to all even cores" echo " ${cmd} --proc-bind=core:even -v -- command ..." - echo " Bind to the first 64 cores and split the current process cpuset into 4" - echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..." - echo " skip GPU 0 when mapping visible devices" + echo " Bind the the even cores of socket 0 and the odd cores of socket 1" + echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..." + echo " Skip GPU 0 when mapping visible devices" echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..." 
echo " Display the current bindings" - echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command" + echo " ${cmd} --proc-bind=numa:0 -- command" echo " Display the current bindings using lstopo" echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo" echo "" @@ -144,7 +164,7 @@ fi declare -a UNKNOWN_ARGS=() declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC} declare -i HPCBIND_DISTRIBUTE=1 -declare -i HPCBIND_PARTITION=0 +declare -i HPCBIND_PARTITION=-1 HPCBIND_PROC_BIND="all" HPCBIND_OPENMP_VERSION=4.0 declare -i HPCBIND_OPENMP_PERCENT=100 @@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND="" HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true} declare -i HPCBIND_VERBOSE=0 -declare -i HPCBIND_SHOW_BINDINGS=0 declare -i HPCBIND_LSTOPO=0 -for i in $@; do - case $i in +HPCBIND_OUTPUT_PREFIX="" +HPCBIND_OUTPUT_MODE="rank0" + +declare -i HPCBIND_HAS_COMMAND=0 + +for i in "$@"; do + case "$i" in # number of partitions to create --no-hwloc-bind) HPCBIND_ENABLE_HWLOC_BIND=0 @@ -169,6 +193,10 @@ for i in $@; do HPCBIND_PROC_BIND="${i#*=}" shift ;; + --whole-system) + HPCBIND_HWLOC_PARENT_CPUSET="" + shift + ;; --distribute=*) HPCBIND_DISTRIBUTE="${i#*=}" shift @@ -182,8 +210,8 @@ for i in $@; do HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ') shift ;; - --gpu-ignore-queue) - HPCBIND_QUEUE_GPU_MAPPING=0 + --ignore-queue) + HPCBIND_QUEUE_MAPPING=0 shift ;; --no-gpu-mapping) @@ -218,14 +246,18 @@ for i in $@; do HPCBIND_OPENMP_NESTED="false" shift ;; - --show-bindings) - HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=1 + --output-prefix=*) + HPCBIND_OUTPUT_PREFIX="${i#*=}" + shift + ;; + --output-mode=*) + HPCBIND_OUTPUT_MODE="${i#*=}" + #convert to lower case + HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}" shift ;; --lstopo) HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=0 HPCBIND_LSTOPO=1 shift ;; @@ -239,6 +271,7 @@ for i in $@; do ;; # ignore remaining arguments --) + HPCBIND_HAS_COMMAND=1 shift break ;; @@ -250,16 +283,41 @@ for i in $@; do esac done 
+################################################################################ +# Check output mode +################################################################################ +declare -i HPCBIND_TEE=0 + +if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then + HPCBIND_TEE=0 +elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then + HPCBIND_TEE=1 +elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then + #default to rank0 printing to screen + HPCBIND_TEE=1 +fi + + +if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then + HPCBIND_LOG=/dev/null + HPCBIND_ERR=/dev/null + HPCBIND_OUT=/dev/null +else + HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log" + HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err" + HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out" + > ${HPCBIND_LOG} +fi + ################################################################################ # Check unknown arguments ################################################################################ if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" + echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG}) exit 1 fi - ################################################################################ # Check that visible gpus are valid ################################################################################ @@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} || ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then - echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0" + echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG}) HPCBIND_VISIBLE_GPUS[$i]=0; fi done NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]} fi - ################################################################################ # Check OpenMP percent 
################################################################################ if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then - echo "OpenMP percent < 1, setting to 1" HPCBIND_OPENMP_PERCENT=1 elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then - echo "OpenMP percent > 100, setting to 100" HPCBIND_OPENMP_PERCENT=100 fi @@ -291,15 +346,21 @@ fi # Check distribute ################################################################################ if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" HPCBIND_DISTRIBUTE=1 fi -if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then - echo "Invalid input for distribute-partition, changing to 0" +################################################################################ +#choose the correct partition +################################################################################ +if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then + HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX} +elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then HPCBIND_PARTITION=0 fi +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE)) +fi ################################################################################ # Find cpuset and num threads @@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0 if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]}) else - BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]}) fi - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) - HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]} + if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at 
core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}" + else + HPCBIND_HWLOC_CPUSET="${BINDING}" + fi HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) else HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) @@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED} ################################################################################ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then - if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then + if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" else declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" fi fi @@ -389,22 +454,22 @@ fi export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} -export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET} +export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}" export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then export HPCBIND_HWLOC_PARENT_CPUSET="all" else - export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET} + export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}" fi -export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND} +export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}" export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') -export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION} +export 
HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}" if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} - export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME} - export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING} + export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}" + export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING} fi @@ -412,43 +477,63 @@ fi # Print verbose ################################################################################ -if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then - MY_ENV=$(env | sort) - echo "[HPCBIND]" - echo "${MY_ENV}" | grep -E "^HPCBIND_" - echo "[CUDA]" - echo "${MY_ENV}" | grep -E "^CUDA_" - echo "[OPENMP]" - echo "${MY_ENV}" | grep -E "^OMP_" -fi +TMP_ENV=$(env | sort) +if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then + echo "[HOST]" >> ${HPCBIND_LOG} + hostname -s >> ${HPCBIND_LOG} + echo "[HPCBIND]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG} + echo "[CUDA]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG} + echo "[OPENMP]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG} -if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu -elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" >> ${HPCBIND_LOG} + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG} + else + echo "Unable to show bindings, hwloc not available." 
>> ${HPCBIND_LOG} + fi +else + echo "[HOST]" > >(tee -a ${HPCBIND_LOG}) + hostname -s > >(tee -a ${HPCBIND_LOG}) + echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG}) + echo "[CUDA]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG}) + echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG}) + + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG}) + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG}) + else + echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG}) + fi fi ################################################################################ # Run command ################################################################################ -if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@ - else - eval $@ - fi -else - if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0 +# must be the last executed command so that the return value is correct +if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! 
-z ${DISPLAY} ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0 +elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then + # clear output files + > ${HPCBIND_ERR} + > ${HPCBIND_OUT} + if [[ ${HPCBIND_TEE} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} else - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} + eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} fi else - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + else + eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + fi fi fi diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind deleted file mode 100755 index b6fe07a1bd..0000000000 --- a/lib/kokkos/bin/kokkos-bind +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env bash - -# check if hwloc commands exist -declare -i HAS_HWLOC=0 -type hwloc-bind >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-distrib >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ls >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-calc >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ps >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - - -#parse args -declare -a UNKNOWN_ARGS=() -declare -i DISTRIBUTE=1 -declare -i INDEX=0 -PROC_BIND="all" -CURRENT_CPUSET="" -OPENMP_VERSION=4.0 -OPENMP_PROC_BIND=True -OPENMP_NESTED=True -VERBOSE=False - -#get the current process cpuset -if [[ ${HAS_HWLOC} -eq 0 ]]; then - MY_PID="$BASHPID" - CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) - echo "$CURRENT_CPUSET" -fi - -function show_help { - local cmd=$(basename "$0") - echo "Usage: ${cmd} -- command ..." 
- echo " Uses hwloc to divide the node into the given number of groups," - echo " set the appropriate OMP_NUM_THREADS and execute the command on the" - echo " selected group." - echo "" - echo " NOTE: This command assumes it has exclusive use of the node" - echo "" - echo "Options:" - echo " --proc-bind= Set the initial process mask for the script. " - echo " LOC can be any valid location argumnet for" - echo " hwloc-calc. Defaults to the entire machine" - echo " --distribute=N Distribute the current proc-bind into N groups" - echo " --index=I Use the i'th group (zero based)" - echo " --openmp=M.m Set env variables for the given OpenMP version" - echo " (default 4.0)" - echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" - echo " --no-openmp-nested Set OMP_NESTED to false" - echo " -v|--verbose" - echo " -h|--help" - echo "" - echo "Sample Usage:" - echo " ${cmd} --distribute=4 --index=2 -v -- command ..." - echo "" -} - -if [[ "$#" -eq 0 ]]; then - show_help - exit 0 -fi - - -for i in $@; do - case $i in - # number of partitions to create - --proc-bind=*) - PROC_BIND="${i#*=}" - shift - ;; - --distribute=*) - DISTRIBUTE="${i#*=}" - shift - ;; - # which group to use - --index=*) - INDEX="${i#*=}" - shift - ;; - --openmp=*) - OPENMP_VERSION="${i#*=}" - shift - ;; - --no-openmp-proc-bind) - OPENMP_PROC_BIND=False - shift - ;; - --no-openmp-nested) - OPENMP_NESTED=False - shift - ;; - -v|--verbose) - VERBOSE=True - shift - ;; - -h|--help) - show_help - exit 0 - ;; - # ignore remaining arguments - --) - shift - break - ;; - # unknown option - *) - UNKNOWN_ARGS+=("$i") - shift - ;; - esac -done - -if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" - exit 1 -fi - -if [[ ${DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" - DISTRIBUTE=1 -fi - -if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then - echo "Invalid input for index, changing index to 0" - INDEX=0 -fi - -if [[ ${HAS_HWLOC} 
-ne 0 ]]; then - echo "hwloc not found, no process binding will occur" - DISTRIBUTE=1 - INDEX=0 -fi - -if [[ ${HAS_HWLOC} -eq 0 ]]; then - - if [[ "${CURRENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${PROC_BIND}) - else - BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND}) - fi - - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE})) - CPUSET=${CPUSETS[${INDEX}]} - NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: true" - echo " proc_bind: ${PROC_BIND}" - echo " distribute: ${DISTRIBUTE}" - echo " index: ${INDEX}" - echo " parent_cpuset: ${CURRENT_CPUSET}" - echo " cpuset: ${CPUSET}" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else - export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ "${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - hwloc-bind ${CPUSET} -- $@ -else - NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: false" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else - export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ 
"${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - eval $@ -fi - diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 09fa5d500a..76e33f3c66 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp} # Check if we have an optimization argument already optimization_applied=0 +# Check if we have -std=c++X or --std=c++X already +stdcxx_applied=0 + #echo "Arguments: $# $@" while [ $# -gt 0 ] @@ -130,10 +133,16 @@ do cuda_args="$cuda_args $1 $2" shift ;; - #Handle c++11 setting - --std=c++11|-std=c++11) - shared_args="$shared_args $1" + #Handle c++11 + --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z) + if [ $stdcxx_applied -eq 1 ]; then + echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting" + else + shared_args="$shared_args $1" + stdcxx_applied=1 + fi ;; + #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98 -std=c++98|--std=c++98) ;; diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt index 96b05c02e1..6f9ca897d9 100644 --- a/lib/kokkos/config/master_history.txt +++ b/lib/kokkos/config/master_history.txt @@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641 tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186 tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1 +tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a diff --git a/lib/kokkos/config/trilinos-integration/checkin-test b/lib/kokkos/config/trilinos-integration/checkin-test index 92a1b1c068..ffb565fcbb 100644 --- a/lib/kokkos/config/trilinos-integration/checkin-test +++ 
b/lib/kokkos/config/trilinos-integration/checkin-test @@ -1,4 +1,4 @@ module purge -module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base +module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu #Run Trilinos CheckinTest diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 0408472c68..996b6b5610 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -125,6 +125,123 @@ namespace Impl { }; } +/// \class GraphRowViewConst +/// \brief View of a row of a sparse graph. +/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph. +/// +/// This class provides a generic view of a row of a sparse graph. +/// We intended this class to view a row of a StaticCrsGraph, but +/// GraphType need not necessarily be CrsMatrix. +/// +/// The row view is suited for computational kernels like sparse +/// matrix-vector multiply, as well as for modifying entries in the +/// sparse matrix. The view is always const as it does not allow graph modification. +/// +/// Here is an example loop over the entries in the row: +/// \code +/// typedef typename GraphRowViewConst::ordinal_type ordinal_type; +/// +/// GraphRowView G_i = ...; +/// const ordinal_type numEntries = G_i.length; +/// for (ordinal_type k = 0; k < numEntries; ++k) { +/// ordinal_type j = G_i.colidx (k); +/// // ... do something with A_ij and j ... 
+///   }
+/// \endcode
+///
+/// GraphType must provide the \c data_type
+/// typedefs. In addition, it must make sense to use GraphRowViewConst to
+/// view a row of GraphType. In particular, column
+/// indices of a row must be accessible using the entries
+/// resp. colidx arrays given to the constructor of this
+/// class, with a constant stride between successive entries.
+/// The stride is one for the compressed sparse row storage format (as
+/// is used by CrsMatrix), but may be greater than one for other
+/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
+template<class GraphType>
+struct GraphRowViewConst {
+  //! The type of the column indices in the row.
+  typedef const typename GraphType::data_type ordinal_type;
+
+private:
+  //! Array of (local) column indices in the row.
+  ordinal_type* colidx_;
+  /// \brief Stride between successive entries in the row.
+  ///
+  /// For compressed sparse row (CSR) storage, this is always one.
+  /// This might be greater than one for storage formats like ELLPACK
+  /// or Jagged Diagonal.  Nevertheless, the stride can never be
+  /// greater than the number of rows or columns in the matrix.  Thus,
+  /// \c ordinal_type is the correct type.
+  const ordinal_type stride_;
+
+public:
+  /// \brief Constructor
+  ///
+  /// \param colidx_in [in] Pointer to the first column index of the
+  ///   row.
+  /// \param stride [in] (Constant) stride between successive column
+  ///   indices in the colidx_in array.
+  /// \param count [in] Number of entries in the row.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( ordinal_type* const colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count) :
+    colidx_ (colidx_in), stride_ (stride), length (count)
+  {}
+
+  /// \brief Constructor with offset into \c colidx array
+  ///
+  /// \param colidx_in [in] Array of the row's column indices.
+  /// \param stride [in] (Constant) stride between successive entries in
+  ///   the colidx_in array.
+ /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array + /// + /// \tparam OffsetType The type of \c idx (see above). Must be a + /// built-in integer type. This may differ from ordinal_type. + /// For example, the matrix may have dimensions that fit in int, + /// but a number of entries that does not fit in int. + template + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( const typename GraphType::entries_type& colidx_in, + const ordinal_type& stride, + const ordinal_type& count, + const OffsetType& idx, + const typename std::enable_if::value, int>::type& = 0) : + colidx_ (&colidx_in(idx)), stride_ (stride), length (count) + {} + + /// \brief Number of entries in the row. + /// + /// This is a public const field rather than a public const method, + /// in order to avoid possible overhead of a method call if the + /// compiler is unable to inline that method call. + /// + /// We assume that rows contain no duplicate entries (i.e., entries + /// with the same column index). Thus, a row may have up to + /// A.numCols() entries. This means that the correct type of + /// 'length' is ordinal_type. + const ordinal_type length; + + /// \brief (Const) reference to the column index of entry i in this + /// row of the sparse matrix. + /// + /// "Entry i" is not necessarily the entry with column index i, nor + /// does i necessarily correspond to the (local) row index. + KOKKOS_INLINE_FUNCTION + ordinal_type& colidx (const ordinal_type& i) const { + return colidx_[i*stride_]; + } + + /// \brief An alias for colidx + KOKKOS_INLINE_FUNCTION + ordinal_type& operator()(const ordinal_type& i) const { + return colidx(i); + } +}; + + /// \class StaticCrsGraph /// \brief Compressed row storage array. /// @@ -218,6 +335,38 @@ public: static_cast (0); } + /// \brief Return a const view of row i of the graph. + /// + /// If row i does not belong to the graph, return an empty view. 
+ /// + /// The returned object \c view implements the following interface: + ///

<ul>
+ /// <li> \c view.length is the number of entries in the row </li>
+ /// <li> \c view.colidx(k) returns a const reference to the
+ /// column index of the k-th entry in the row </li>
+ /// </ul>
+ /// k is not a column index; it just counts from 0 to + /// view.length - 1. + /// + /// Users should not rely on the return type of this method. They + /// should instead assign to 'auto'. That allows compile-time + /// polymorphism for different kinds of sparse matrix formats (e.g., + /// ELLPACK or Jagged Diagonal) that we may wish to support in the + /// future. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst rowConst (const data_type i) const { + const size_type start = row_map(i); + // count is guaranteed to fit in ordinal_type, as long as no row + // has duplicate entries. + const data_type count = static_cast (row_map(i+1) - start); + + if (count == 0) { + return GraphRowViewConst (NULL, 1, 0); + } else { + return GraphRowViewConst (entries, 1, count, start); + } + } + /** \brief Create a row partitioning into a given number of blocks * balancing non-zeros + a fixed cost per row. */ diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp index 46321378d9..c184c14d07 100644 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp @@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + 
(index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1); } @@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(offset_0 , offset_1); } @@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag> if (RP::inner_direction == RP::Left) { // Loop over size maxnumblocks until full range covered for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1); } @@ -157,11 +157,11 @@ 
struct DeviceIterateTile<2,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(Tag(), offset_0 , offset_1); } @@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + 
(index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> { if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += 
gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -340,19 
+340,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const 
index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const 
index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3); } @@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < 
m_rp.m_tile[3] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3); } @@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - 
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < 
m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const 
index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const 
index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + 
thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } @@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { 
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type 
offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp index cae8ecd489..079d9f0889 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory() template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() { const DriverType & driver = @@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver ) template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_local_memory( const DriverType driver ) { driver(); diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 26b47a8b74..f8355f0d06 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -242,45 +242,89 @@ public: re_ = v; } + template KOKKOS_INLINE_FUNCTION - complex& operator += (const complex& src) { + complex& + operator += (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; return *this; } + 
template KOKKOS_INLINE_FUNCTION - void operator += (const volatile complex& src) volatile { + void + operator += (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; } KOKKOS_INLINE_FUNCTION - complex& operator += (const RealType& src) { + complex& + operator += (const std::complex& src) { + re_ += src.real(); + im_ += src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator += (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator += (const volatile RealType& src) volatile { + void + operator += (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; } - + + template KOKKOS_INLINE_FUNCTION - complex& operator -= (const complex& src) { + complex& + operator -= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src.re_; im_ -= src.im_; return *this; } KOKKOS_INLINE_FUNCTION - complex& operator -= (const RealType& src) { + complex& + operator -= (const std::complex& src) { + re_ -= src.real(); + im_ -= src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator -= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src; return *this; } + template KOKKOS_INLINE_FUNCTION - complex& operator *= (const complex& src) { + complex& + operator *= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -288,8 
+332,12 @@ public: return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile complex& src) volatile { + void + operator *= (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -297,20 +345,70 @@ public: } KOKKOS_INLINE_FUNCTION - complex& operator *= (const RealType& src) { + complex& + operator *= (const std::complex& src) { + const RealType realPart = re_ * src.real() - im_ * src.imag(); + const RealType imagPart = re_ * src.imag() + im_ * src.real(); + re_ = realPart; + im_ = imagPart; + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator *= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile RealType& src) volatile { + void + operator *= (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; } + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const complex& y) { + complex& + operator /= (const complex& y) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. + // If the real part is +/-Inf and the imaginary part is -/+Inf, + // this won't change the result. + const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ()); + + // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. + // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, + // because y/s is NaN. 
+ if (s == 0.0) { + this->re_ /= s; + this->im_ /= s; + } + else { + const complex x_scaled (this->re_ / s, this->im_ / s); + const complex y_conj_scaled (y.re_ / s, -(y.im_) / s); + const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ + + y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y)) + *this = x_scaled * y_conj_scaled; + *this /= y_scaled_abs; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + complex& + operator /= (const std::complex& y) { + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. @@ -334,57 +432,95 @@ public: return *this; } + + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const RealType& src) { + complex& + operator /= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + re_ /= src; im_ /= src; return *this; } + template KOKKOS_INLINE_FUNCTION - bool operator == (const complex& src) { - return (re_ == src.re_) && (im_ == src.im_); + bool + operator == (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src.re_)) && (im_ == static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator == (const RealType src) { - return (re_ == src) && (im_ == RealType(0)); + bool + operator == (const std::complex& src) { + return (re_ == src.real()) && (im_ == src.imag()); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator == (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src)) && (im_ == RealType(0)); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator != (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ != static_cast(src.re_)) || (im_ 
!= static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator != (const complex& src) { - return (re_ != src.re_) || (im_ != src.im_); + bool + operator != (const std::complex& src) { + return (re_ != src.real()) || (im_ != src.imag()); } + template KOKKOS_INLINE_FUNCTION - bool operator != (const RealType src) { - return (re_ != src) || (im_ != RealType(0)); - } + bool + operator != (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + return (re_ != static_cast(src)) || (im_ != RealType(0)); + } + }; //! Binary + operator for complex complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const complex& y) { - return complex (x.real () + y.real (), x.imag () + y.imag ()); +complex::type> +operator + (const complex& x, const complex& y) { + return complex::type > (x.real () + y.real (), x.imag () + y.imag ()); } //! Binary + operator for complex scalar. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const RealType& y) { - return complex (x.real () + y , x.imag ()); +complex::type> +operator + (const complex& x, const RealType2& y) { + return complex::type> (x.real () + y , x.imag ()); } //! Binary + operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const RealType& x, const complex& y) { - return complex (x + y.real (), y.imag ()); +complex::type> +operator + (const RealType1& x, const complex& y) { + return complex::type> (x + y.real (), y.imag ()); } //! Unary + operator for complex. @@ -396,27 +532,27 @@ operator + (const complex& x) { } //! Binary - operator for complex. 
-template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const complex& y) { - return complex (x.real () - y.real (), x.imag () - y.imag ()); +complex::type> +operator - (const complex& x, const complex& y) { + return complex::type> (x.real () - y.real (), x.imag () - y.imag ()); } //! Binary - operator for complex scalar. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const RealType& y) { - return complex (x.real () - y , x.imag ()); +complex::type> +operator - (const complex& x, const RealType2& y) { + return complex::type> (x.real () - y , x.imag ()); } //! Binary - operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const RealType& x, const complex& y) { - return complex (x - y.real (), - y.imag ()); +complex::type> +operator - (const RealType1& x, const complex& y) { + return complex::type> (x - y.real (), - y.imag ()); } //! Unary - operator for complex. @@ -428,12 +564,12 @@ operator - (const complex& x) { } //! Binary * operator for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +complex::type> +operator * (const complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for std::complex and complex. @@ -446,33 +582,34 @@ operator * (const complex& x, const complex& y) { /// This function cannot be called in a CUDA device function, because /// std::complex's methods and nonmember functions are not marked as /// CUDA device functions. 
-template -complex -operator * (const std::complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +template +inline +complex::type> +operator * (const std::complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const RealType& x, const complex& y) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const RealType1& x, const complex& y) { + return complex::type> (x * y.real (), x * y.imag ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& y, const RealType& x) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const complex& y, const RealType2& x) { + return complex::type> (x * y.real (), x * y.imag ()); } //! Imaginary part of a complex number. @@ -539,33 +676,34 @@ complex pow (const complex& x) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const complex& x, const RealType2& y) { - return complex (real (x) / y, imag (x) / y); + return complex::type> (real (x) / y, imag (x) / y); } //! Binary operator / for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator / (const complex& x, const complex& y) { +complex::type> +operator / (const complex& x, const complex& y) { // Scale (by the "1-norm" of y) to avoid unwarranted overflow. 
// If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs (real (y)) + std::fabs (imag (y)); + typedef typename std::common_type::type common_real_type; + const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y)); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, // because y/s is NaN. if (s == 0.0) { - return complex (real (x) / s, imag (x) / s); + return complex (real (x) / s, imag (x) / s); } else { - const complex x_scaled (real (x) / s, imag (x) / s); - const complex y_conj_scaled (real (y) / s, -imag (y) / s); - const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + + const complex x_scaled (real (x) / s, imag (x) / s); + const complex y_conj_scaled (real (y) / s, -imag (y) / s); + const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y)) - complex result = x_scaled * y_conj_scaled; + complex result = x_scaled * y_conj_scaled; result /= y_scaled_abs; return result; } @@ -574,16 +712,19 @@ operator / (const complex& x, const complex& y) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const RealType1& x, const complex& y) { - return complex (x)/y; + return complex::type> (x)/y; } //! Equality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const complex& y) { - return real (x) == real (y) && imag (x) == imag (y); +bool +operator == (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(real (y)) && + static_cast(imag (x)) == static_cast(imag (y)) ); } /// \brief Equality operator for std::complex and Kokkos::complex. 
@@ -592,50 +733,68 @@ bool operator == (const complex& x, const complex& y) { /// Otherwise, CUDA builds will give compiler warnings ("warning: /// calling a constexpr __host__ function("real") from a __host__ /// __device__ function("operator==") is not allowed"). -template -bool operator == (const std::complex& x, const complex& y) { - return std::real (x) == real (y) && std::imag (x) == imag (y); +template +inline +bool +operator == (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) == static_cast(real (y)) && + static_cast(std::imag (x)) == static_cast(imag (y)) ); } - + //! Equality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const RealType2& y) { - return real (x) == y && imag (x) == static_cast (0.0); +bool +operator == (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(y) && + static_cast(imag (x)) == static_cast(0.0) ); } //! Equality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const RealType& x, const complex& y) { +bool +operator == (const RealType1& x, const complex& y) { return y == x; } //! Inequality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const complex& y) { - return real (x) != real (y) || imag (x) != imag (y); +bool +operator != (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(real (y)) || + static_cast(imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for std::complex and Kokkos::complex. 
-template -KOKKOS_INLINE_FUNCTION -bool operator != (const std::complex& x, const complex& y) { - return std::real (x) != real (y) || std::imag (x) != imag (y); +template +inline +bool +operator != (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) != static_cast(real (y)) || + static_cast(std::imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const RealType2& y) { - return real (x) != y || imag (x) != static_cast (0.0); +bool +operator != (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(y) || + static_cast(imag (x)) != static_cast(0.0) ); } //! Inequality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const RealType& x, const complex& y) { +bool +operator != (const RealType1& x, const complex& y) { return y != x; } diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index f089c16ad2..b9c131cd7a 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -353,7 +353,14 @@ struct CountAndFill { struct Fill {}; KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const { auto j = m_crs.row_map(i); - data_type* fill = &(m_crs.entries(j)); + /* we don't want to access entries(entries.size()), even if its just to get its + address and never use it. + this can happen when row (i) is empty and all rows after it are also empty. + we could compare to row_map(i + 1), but that is a read from global memory, + whereas dimension_0() should be part of the View in registers (or constant memory) */ + data_type* fill = + (j == static_cast(m_crs.entries.dimension_0())) ? 
+ nullptr : (&(m_crs.entries(j))); m_functor(i, fill); } using self_type = CountAndFill; diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp index 9c9af0dd8b..b811751a2c 100644 --- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp @@ -147,12 +147,11 @@ public: , const size_t arg_alloc_size ) const; /**\brief Return Name of the MemorySpace */ - static constexpr const char* name(); + static constexpr const char* name() { return "HBW"; } private: AllocationMechanism m_alloc_mech; - static constexpr const char* m_name = "HBW"; friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >; }; diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp index 339571941d..a825fd54d3 100644 --- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -192,7 +192,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast(0.0f);} KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast(1.0f);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return -FLT_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;} }; @@ -200,7 +200,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast(0.0);} KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast(1.0);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return -DBL_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;} }; @@ -208,7 +208,7 @@ template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() 
{return static_cast(0.0);} KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast(1.0);} - KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;} + KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;} }; diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp index b13b0b01de..0118d4667e 100644 --- a/lib/kokkos/core/src/Kokkos_ROCm.hpp +++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp @@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace } // namespace Kokkos + +#define threadIdx_x (hc_get_workitem_id(0)) +#define threadIdx_y (hc_get_workitem_id(1)) +#define threadIdx_z (hc_get_workitem_id(2)) + +#define blockIdx_x (hc_get_group_id(0)) +#define blockIdx_y (hc_get_group_id(1)) +#define blockIdx_z (hc_get_group_id(2)) + +#define blockDim_x (hc_get_group_size(0)) +#define blockDim_y (hc_get_group_size(1)) +#define blockDim_z (hc_get_group_size(2)) + +#define gridDim_x (hc_get_num_groups(0)) +#define gridDim_y (hc_get_num_groups(1)) +#define gridDim_z (hc_get_num_groups(2)) + + #include #include diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile index 8fb13b8954..a917cf1656 100644 --- a/lib/kokkos/core/src/Makefile +++ b/lib/kokkos/core/src/Makefile @@ -88,6 +88,7 @@ build-makefile-kokkos: echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos echo "" >> Makefile.kokkos echo "#Variables used in application Makefiles" >> Makefile.kokkos + echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp index 37d2ac8318..de84f6e59f 100644 --- 
a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp @@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f , thread_local_bytes ); + omp_set_num_threads(partition_size); f( omp_get_thread_num(), omp_get_num_threads() ); Impl::t_openmp_instance->~Exec(); diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp index 0b7a1e2583..f2674e5929 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp @@ -113,7 +113,6 @@ void reduce_enqueue( if (output_length < 1) return; - assert(output_result != nullptr); const auto td = get_tile_desc(szElements,output_length,team_size,vector_size, shared_size); // allocate host and device memory for the results from each team @@ -176,14 +175,17 @@ void reduce_enqueue( } }); - ValueInit::init(ReducerConditional::select(f, reducer), output_result); + if (output_result != nullptr) + ValueInit::init(ReducerConditional::select(f, reducer), output_result); fut.wait(); copy(result,result_cpu.data()); - for(std::size_t i=0;i result(td.num_tiles); hc::array scratch(len); - tile_for(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] + tile_for(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] { const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; @@ -135,7 +135,7 @@ void scan_enqueue( ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]); copy(result_cpu.data(),result); - hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]] + hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] { // const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp index 3d3029535e..c5e73c8b26 100644 --- 
a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp @@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept return full != i ? _bit_scan_forward( ~i ) : -1 ; #elif defined( KOKKOS_COMPILER_IBM ) return full != i ? __cnttz4( ~i ) : -1 ; +#elif defined( KOKKOS_COMPILER_CRAYC ) + return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ; #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ ) return full != i ? __builtin_ffs( ~i ) - 1 : -1 ; #else @@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i ) return _bit_scan_forward(i); #elif defined( KOKKOS_COMPILER_IBM ) return __cnttz4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return i ? _popcnt(~i & (i-1)) : -1; #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ ) return __builtin_ffs(i) - 1; #else - unsigned t = 1u; - int r = 0; - while ( i && ( i & t == 0 ) ) - { - t = t << 1; - ++r; + int offset = -1; + if ( i ) { + for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset ); } - return r; + return offset; #endif } @@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i ) return _bit_scan_reverse(i); #elif defined( KOKKOS_COMPILER_IBM ) return shift - __cntlz4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return i ? 
shift - _leadz32(i) : 0 ; #elif defined( __GNUC__ ) || defined( __GNUG__ ) return shift - __builtin_clz(i); #else - unsigned t = 1u << shift; - int r = 0; - while ( i && ( i & t == 0 ) ) - { - t = t >> 1; - ++r; + int offset = 0; + if ( i ) { + for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset ); } - return r; + return offset; #endif } @@ -142,6 +142,8 @@ int bit_count( unsigned i ) return _popcnt32(i); #elif defined( KOKKOS_COMPILER_IBM ) return __popcnt4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return _popcnt(i); #elif defined( __GNUC__ ) || defined( __GNUG__ ) return __builtin_popcount(i); #else diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp index e11f8b6d34..cd0553218d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp @@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s } } -constexpr const char* HBWSpace::name() { - return m_name; -} - } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp index ce5537fed3..c7f681699e 100644 --- a/lib/kokkos/core/unit_test/TestComplex.hpp +++ b/lib/kokkos/core/unit_test/TestComplex.hpp @@ -114,7 +114,7 @@ struct TestComplexBasicMath { typename Kokkos::View*,ExecSpace>::HostMirror h_results; void testit () { - d_results = Kokkos::View*,ExecSpace>("TestComplexBasicMath",20); + d_results = Kokkos::View*,ExecSpace>("TestComplexBasicMath",24); h_results = Kokkos::create_mirror_view(d_results); Kokkos::parallel_for(Kokkos::RangePolicy(0,1), *this); @@ -125,6 +125,7 @@ struct TestComplexBasicMath { std::complex b(3.25,5.75); std::complex d(1.0,2.0); double c = 9.3; + int e = 2; std::complex r; r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag()); @@ -147,6 +148,12 @@ struct TestComplexBasicMath { r = c-a; 
ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag()); r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag()); r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag()); + + r = a; + /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag()); + /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag()); + /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e); + /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e); } KOKKOS_INLINE_FUNCTION @@ -190,6 +197,12 @@ struct TestComplexBasicMath { d_results(17) = c-a; d_results(18) = c*a; d_results(19) = c/a; + + int e = 2; + d_results(20) = a+e; + d_results(21) = a-e; + d_results(22) = a*e; + d_results(23) = a/e; } }; diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index f579ddf02c..fbc3a65c2f 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -286,7 +286,9 @@ struct TestMDRange_2D { // Test with reducers - scalar { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; - range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + int s0 = 1; + int s1 = 1; + range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} ); TestMDRange_2D functor( N0, N1 ); @@ -297,7 +299,7 @@ struct TestMDRange_2D { parallel_reduce( range, functor, reducer_scalar ); - ASSERT_EQ( sum, 2 * N0 * N1 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) ); } // Test with reducers - scalar view { @@ -445,7 +447,9 @@ struct TestMDRange_2D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type 
point_type; - range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + const int s0 = 1; + const int s1 = 1; + range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); parallel_for( range, functor ); @@ -454,8 +458,8 @@ struct TestMDRange_2D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) { if ( h_view( i, j ) != 3 ) { ++counter; @@ -463,7 +467,7 @@ struct TestMDRange_2D { } if ( counter != 0 ) { - printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); + printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -699,6 +703,7 @@ struct TestMDRange_2D { ASSERT_EQ( counter, 0 ); } + } // end test_for2 }; // MDRange_2D @@ -749,7 +754,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); @@ -757,7 +765,7 @@ struct TestMDRange_3D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) ); } // Test with reducers - scalar @@ -952,7 +960,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( 
point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); parallel_for( range, functor ); @@ -961,9 +972,9 @@ struct TestMDRange_3D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) { if ( h_view( i, j, k ) != 3 ) { ++counter; @@ -971,7 +982,7 @@ struct TestMDRange_3D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -1207,7 +1218,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); @@ -1215,7 +1230,7 @@ struct TestMDRange_4D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) ); } // Test with reducers - scalar @@ -1415,7 +1430,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); TestMDRange_4D 
functor( N0, N1, N2, N3 ); parallel_for( range, functor ); @@ -1424,10 +1443,10 @@ struct TestMDRange_4D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) { if ( h_view( i, j, k, l ) != 3 ) { ++counter; @@ -1435,7 +1454,7 @@ struct TestMDRange_4D { } if ( counter != 0 ) { - printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); + printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); } ASSERT_EQ( counter, 0 ); @@ -1682,7 +1701,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); @@ -1690,7 +1714,7 @@ struct TestMDRange_5D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) ); } // Test with reducers - scalar @@ -1810,7 +1834,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, 
s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); parallel_for( range, functor ); @@ -1819,11 +1848,11 @@ struct TestMDRange_5D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) { if ( h_view( i, j, k, l, m ) != 3 ) { ++counter; @@ -1831,7 +1860,7 @@ struct TestMDRange_5D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -2084,7 +2113,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); @@ -2092,7 +2127,7 @@ struct TestMDRange_6D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) ); } // Test with reducers - scalar @@ -2214,7 +2249,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, 
point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); parallel_for( range, functor ); @@ -2223,12 +2264,12 @@ struct TestMDRange_6D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) - for ( int n = 0; n < N5; ++n ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) + for ( int n = s5; n < N5; ++n ) { if ( h_view( i, j, k, l, m, n ) != 3 ) { ++counter; @@ -2236,7 +2277,7 @@ struct TestMDRange_6D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); From c522b1b7a974792e0a6a4285b7171a73040774c9 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 4 Oct 2017 00:22:56 -0400 Subject: [PATCH 41/53] add call to fftw_cleanup() before exiting to avoid bogus leak reports when compiling with FFTW v3.x --- src/main.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/main.cpp b/src/main.cpp index 7401183fea..82dac5af6d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,10 @@ #include #endif +#ifdef FFT_FFTW3 +#include +#endif + using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- @@ -62,4 
+66,10 @@ int main(int argc, char **argv) #endif MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); + +#ifdef FFT_FFTW3 + // tell fftw3 to delete its global memory pool + // and thus avoid bogus valgrind memory leak reports + fftw_cleanup(); +#endif } From bda0ee3aa1ebc3c652bfd83da5f9281394a8a650 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 4 Oct 2017 12:06:03 -0600 Subject: [PATCH 42/53] Destroy unneeded fix in pair_reaxc_kokkos --- src/KOKKOS/pair_reaxc_kokkos.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/KOKKOS/pair_reaxc_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp index d95cd8f8ae..cd0ebcde05 100644 --- a/src/KOKKOS/pair_reaxc_kokkos.cpp +++ b/src/KOKKOS/pair_reaxc_kokkos.cpp @@ -131,6 +131,8 @@ template void PairReaxCKokkos::init_style() { PairReaxC::init_style(); + if (fix_reax) modify->delete_fix("REAXC"); // not needed in the Kokkos version + fix_reax = NULL; // irequest = neigh request made by parent class From 3653f4012013ac1f04840395ff3c1b821a30b4da Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 4 Oct 2017 12:10:13 -0600 Subject: [PATCH 43/53] Reduce unnecessary communication in fix_qeq_reax --- src/KOKKOS/fix_qeq_reax_kokkos.cpp | 15 ++++++++------- src/USER-REAXC/fix_qeq_reax.cpp | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp index e54b53ae89..5a1d3c7f1c 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp @@ -63,6 +63,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) : nmax = nmax = m_cap = 0; allocated_flag = 0; + nprev = 4; } /* ---------------------------------------------------------------------- */ @@ -158,15 +159,15 @@ void FixQEqReaxKokkos::init_hist() { int i,j; - k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,5); + k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev); d_s_hist = k_s_hist.template view(); h_s_hist = k_s_hist.h_view; - k_t_hist = 
DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,5); + k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev); d_t_hist = k_t_hist.template view(); h_t_hist = k_t_hist.h_view; for( i = 0; i < atom->nmax; i++ ) - for( j = 0; j < 5; j++ ) + for( j = 0; j < nprev; j++ ) k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0; k_s_hist.template modify(); @@ -334,11 +335,11 @@ void FixQEqReaxKokkos::allocate_array() d_d = k_d.template view(); h_d = k_d.h_view; - k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,5); + k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev); d_s_hist = k_s_hist.template view(); h_s_hist = k_s_hist.h_view; - k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,5); + k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev); d_t_hist = k_t_hist.template view(); h_t_hist = k_t_hist.h_view; } @@ -368,7 +369,7 @@ void FixQEqReaxKokkos::zero_item(int ii) const d_o[i] = 0.0; d_r[i] = 0.0; d_d[i] = 0.0; - //for( int j = 0; j < 5; j++ ) + //for( int j = 0; j < nprev; j++ ) //d_s_hist(i,j) = d_t_hist(i,j) = 0.0; } @@ -1173,7 +1174,7 @@ double FixQEqReaxKokkos::memory_usage() { double bytes; - bytes = atom->nmax*5*2 * sizeof(F_FLOAT); // s_hist & t_hist + bytes = atom->nmax*nprev*2 * sizeof(F_FLOAT); // s_hist & t_hist bytes += atom->nmax*8 * sizeof(F_FLOAT); // storage bytes += n_cap*2 * sizeof(int); // matrix... 
bytes += m_cap * sizeof(int); diff --git a/src/USER-REAXC/fix_qeq_reax.cpp b/src/USER-REAXC/fix_qeq_reax.cpp index 9d165f3fd3..33b70c972d 100644 --- a/src/USER-REAXC/fix_qeq_reax.cpp +++ b/src/USER-REAXC/fix_qeq_reax.cpp @@ -95,7 +95,7 @@ FixQEqReax::FixQEqReax(LAMMPS *lmp, int narg, char **arg) : pack_flag = 0; s = NULL; t = NULL; - nprev = 5; + nprev = 4; Hdia_inv = NULL; b_s = NULL; From 2b0bfcb10f906873f3771193f7cb1374dfd9a39e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 5 Oct 2017 10:35:09 -0600 Subject: [PATCH 44/53] Fix memory leak in pair_reaxc_kokkos --- src/KOKKOS/pair_reaxc_kokkos.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/pair_reaxc_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp index cd0ebcde05..d5f83f4537 100644 --- a/src/KOKKOS/pair_reaxc_kokkos.cpp +++ b/src/KOKKOS/pair_reaxc_kokkos.cpp @@ -557,8 +557,8 @@ void PairReaxCKokkos::Deallocate_Lookup_Tables() ntypes = atom->ntypes; - for( i = 0; i < ntypes; ++i ) { - for( j = i; j < ntypes; ++j ) + for( i = 0; i <= ntypes; ++i ) { + for( j = i; j <= ntypes; ++j ) if( LR[i][j].n ) { sfree( LR[i][j].y, "LR[i,j].y" ); sfree( LR[i][j].H, "LR[i,j].H" ); From eecd2fbaee77536910031b3f40b455b242c301b1 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 5 Oct 2017 11:23:31 -0600 Subject: [PATCH 45/53] Remove hardcoded value in fix_qeq_reax --- src/KOKKOS/fix_qeq_reax_kokkos.cpp | 2 +- src/USER-OMP/fix_qeq_reax_omp.cpp | 2 +- src/USER-REAXC/fix_qeq_reax.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp index 5a1d3c7f1c..5d2f6a0438 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp @@ -1088,7 +1088,7 @@ void FixQEqReaxKokkos::calculate_q_item(int ii) const if (mask[i] & groupbit) { q(i) = d_s[i] - delta * d_t[i]; - for (int k = 4; k > 0; --k) { + for (int k = nprev-1; k > 0; --k) { d_s_hist(i,k) = d_s_hist(i,k-1); d_t_hist(i,k) = 
d_t_hist(i,k-1); } diff --git a/src/USER-OMP/fix_qeq_reax_omp.cpp b/src/USER-OMP/fix_qeq_reax_omp.cpp index 4457ab6592..d89c9627fe 100644 --- a/src/USER-OMP/fix_qeq_reax_omp.cpp +++ b/src/USER-OMP/fix_qeq_reax_omp.cpp @@ -703,7 +703,7 @@ void FixQEqReaxOMP::calculate_Q() q[i] = s[i] - u * t[i]; // backup s & t - for (int k = 4; k > 0; --k) { + for (int k = nprev-1; k > 0; --k) { s_hist[i][k] = s_hist[i][k-1]; t_hist[i][k] = t_hist[i][k-1]; } diff --git a/src/USER-REAXC/fix_qeq_reax.cpp b/src/USER-REAXC/fix_qeq_reax.cpp index 33b70c972d..d1c4f90771 100644 --- a/src/USER-REAXC/fix_qeq_reax.cpp +++ b/src/USER-REAXC/fix_qeq_reax.cpp @@ -817,7 +817,7 @@ void FixQEqReax::calculate_Q() q[i] = s[i] - u * t[i]; /* backup s & t */ - for (k = 4; k > 0; --k) { + for (k = nprev-1; k > 0; --k) { s_hist[i][k] = s_hist[i][k-1]; t_hist[i][k] = t_hist[i][k-1]; } From 6bf2c60c07edefc7e1843b289454ce7ecb645e0a Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 5 Oct 2017 14:58:05 -0600 Subject: [PATCH 46/53] Fix issues in Kokkos comm --- src/KOKKOS/comm_kokkos.cpp | 60 +++++++++++++++++++++++------------- src/KOKKOS/verlet_kokkos.cpp | 7 +++-- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index d4d348d7e2..3276d0cdb0 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -135,9 +135,10 @@ void CommKokkos::init() if (force->newton == 0) check_reverse = 0; if (force->pair) check_reverse += force->pair->comm_reverse_off; - if (check_reverse || check_forward) - forward_comm_classic = true; + //if (check_forward) + // forward_comm_classic = true; + //if (check_reverse || !comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet reverse_comm_classic = true; } @@ -186,12 +187,12 @@ void CommKokkos::forward_comm_device(int dummy) // if comm_x_only set, exchange or copy directly to 
x, don't unpack k_sendlist.sync(); + atomKK->sync(ExecutionSpaceFromDevice::space,X_MASK); for (int iswap = 0; iswap < nswap; iswap++) { if (sendproc[iswap] != me) { if (comm_x_only) { if (size_forward_recv[iswap]) { - atomKK->sync(ExecutionSpaceFromDevice::space,X_MASK); buf = atomKK->k_x.view().ptr_on_device() + firstrecv[iswap]*atomKK->k_x.view().dimension_1(); MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, @@ -204,9 +205,11 @@ void CommKokkos::forward_comm_device(int dummy) n,MPI_DOUBLE,sendproc[iswap],0,world); } - if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); - atomKK->modified(ExecutionSpaceFromDevice:: - space,X_MASK); + if (size_forward_recv[iswap]) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + atomKK->modified(ExecutionSpaceFromDevice:: + space,X_MASK); + } } else if (ghost_velocity) { error->all(FLERR,"Ghost velocity forward comm not yet " "implemented with Kokkos"); @@ -276,7 +279,7 @@ void CommKokkos::reverse_comm() else atomKK->modified(Host,ALL_MASK); - atomKK->sync(Device,ALL_MASK); // is this needed? + //atomKK->sync(Device,ALL_MASK); // is this needed? 
} template @@ -290,9 +293,10 @@ void CommKokkos::reverse_comm_device() // exchange data with another proc // if other proc is self, just copy // if comm_f_only set, exchange or copy directly from f, don't pack - + k_sendlist.sync(); - + atomKK->sync(ExecutionSpaceFromDevice::space,F_MASK); + for (int iswap = nswap-1; iswap >= 0; iswap--) { if (sendproc[iswap] != me) { if (comm_f_only) { @@ -300,16 +304,17 @@ void CommKokkos::reverse_comm_device() MPI_Irecv(k_buf_recv.view().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE, sendproc[iswap],0,world,&request); if (size_reverse_send[iswap]) { - atomKK->sync(ExecutionSpaceFromDevice::space,F_MASK); buf = atomKK->k_f.view().ptr_on_device() + firstrecv[iswap]*atomKK->k_f.view().dimension_1(); MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, recvproc[iswap],0,world); } - if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); - atomKK->modified(ExecutionSpaceFromDevice:: - space,F_MASK); + if (size_reverse_recv[iswap]) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + atomKK->modified(ExecutionSpaceFromDevice:: + space,F_MASK); + } } else { if (size_reverse_recv[iswap]) MPI_Irecv(k_buf_recv.view().ptr_on_device(), @@ -710,9 +715,7 @@ void CommKokkos::borders() } atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); k_sendlist.sync(); - k_sendlist.modify(); CommBrick::borders(); k_sendlist.modify(); atomKK->modified(Host,ALL_MASK); @@ -783,7 +786,7 @@ void CommKokkos::borders_device() { AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; ExecutionSpace exec_space = ExecutionSpaceFromDevice::space; - k_sendlist.modify(); + k_sendlist.sync(); atomKK->sync(exec_space,ALL_MASK); // do swaps over all 3 dimensions @@ -845,20 +848,24 @@ void CommKokkos::borders_device() { k_total_send.template modify(); k_total_send.template sync(); + k_sendlist.modify(); + if(k_total_send.h_view() >= maxsendlist[iswap]) { grow_list(iswap,k_total_send.h_view()); - k_sendlist.modify(); + k_total_send.h_view() = 0; - 
if(exec_space == Device) { - k_total_send.template modify(); - k_total_send.template sync(); - } + k_total_send.template modify(); + k_total_send.template sync(); + BuildBorderListFunctor f(atomKK->k_x,k_sendlist, k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); + k_total_send.template modify(); k_total_send.template sync(); + + k_sendlist.modify(); } nsend = k_total_send.h_view(); } else { @@ -983,7 +990,6 @@ void CommKokkos::borders_device() { // reset global->local map - if (exec_space == Host) k_sendlist.sync(); atomKK->modified(exec_space,ALL_MASK); if (map_style) { atomKK->sync(Host,TAG_MASK); @@ -1057,6 +1063,11 @@ void CommKokkos::grow_list(int iswap, int n) { int size = static_cast (BUFFACTOR * n); + if (exchange_comm_classic) { // force realloc on Host + k_sendlist.sync(); + k_sendlist.modify(); + } + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); for(int i=0;i(); + k_sendlist.modify(); + } + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); memory->grow(maxsendlist,n,"comm:maxsendlist"); diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp index e4a3f857d3..cb9d60f9ca 100644 --- a/src/KOKKOS/verlet_kokkos.cpp +++ b/src/KOKKOS/verlet_kokkos.cpp @@ -526,8 +526,11 @@ void VerletKokkos::run(int n) // reverse communication of forces - if (force->newton) comm->reverse_comm(); - timer->stamp(Timer::COMM); + if (force->newton) { + Kokkos::fence(); + comm->reverse_comm(); + timer->stamp(Timer::COMM); + } // force modifications, final time integration, diagnostics From 44d2e8ff74d515d601b2cd330c5fb2e724016f3f Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 5 Oct 2017 15:27:00 -0600 Subject: [PATCH 47/53] Add pre_reverse to verlet_kokkos and comment out timer --- src/KOKKOS/verlet_kokkos.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git 
a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp index cb9d60f9ca..adec5ff1bd 100644 --- a/src/KOKKOS/verlet_kokkos.cpp +++ b/src/KOKKOS/verlet_kokkos.cpp @@ -294,6 +294,7 @@ void VerletKokkos::run(int n) int n_pre_exchange = modify->n_pre_exchange; int n_pre_neighbor = modify->n_pre_neighbor; int n_pre_force = modify->n_pre_force; + int n_pre_reverse = modify->n_pre_reverse; int n_post_force = modify->n_post_force; int n_end_of_step = modify->n_end_of_step; @@ -304,9 +305,9 @@ void VerletKokkos::run(int n) f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.dimension_0()); - static double time = 0.0; atomKK->sync(Device,ALL_MASK); - Kokkos::Impl::Timer ktimer; + //static double time = 0.0; + //Kokkos::Impl::Timer ktimer; timer->init_timeout(); for (int i = 0; i < n; i++) { @@ -320,10 +321,10 @@ void VerletKokkos::run(int n) // initial time integration - ktimer.reset(); + //ktimer.reset(); timer->stamp(); modify->initial_integrate(vflag); - time += ktimer.seconds(); + //time += ktimer.seconds(); if (n_post_integrate) modify->post_integrate(); timer->stamp(Timer::MODIFY); @@ -523,6 +524,10 @@ void VerletKokkos::run(int n) atomKK->k_f.modify(); } + if (n_pre_reverse) { + modify->pre_reverse(eflag,vflag); + timer->stamp(Timer::MODIFY); + } // reverse communication of forces From e0efdd50fab3c2f98c22ebd58e1c63e7be4045b4 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 5 Oct 2017 15:47:46 -0600 Subject: [PATCH 48/53] Switch to classic comm if ghost_velocity. The check_forward and check_reverse tests aren't necessary because the fix/pair/etc. comm is done in a separate routine. 
--- src/KOKKOS/comm_kokkos.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 3276d0cdb0..5534341342 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -135,10 +135,9 @@ void CommKokkos::init() if (force->newton == 0) check_reverse = 0; if (force->pair) check_reverse += force->pair->comm_reverse_off; - //if (check_forward) - // forward_comm_classic = true; + if (ghost_velocity) + forward_comm_classic = true; - //if (check_reverse || !comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet reverse_comm_classic = true; } From 214c0cfb2b7e83697b3e70db81b12f46eb0d7370 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Thu, 5 Oct 2017 16:44:24 -0600 Subject: [PATCH 49/53] add atom_modify map yes, also timers to create_atoms and replicate --- doc/src/Section_packages.txt | 2 +- doc/src/atom_modify.txt | 52 +++++++++++++++++++----------------- doc/src/fix_nh.txt | 36 ++++++++++++++----------- src/atom.cpp | 5 ++-- src/atom_map.cpp | 4 +-- src/create_atoms.cpp | 16 +++++++++-- src/output.cpp | 2 +- src/replicate.cpp | 17 ++++++++++++ 8 files changed, 86 insertions(+), 48 deletions(-) diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt index d9a9fb4163..b0b2d9fa63 100644 --- a/doc/src/Section_packages.txt +++ b/doc/src/Section_packages.txt @@ -727,7 +727,7 @@ make lib-latte # print help message make lib-latte args="-b" # download and build in lib/latte/LATTE-master make lib-latte args="-p $HOME/latte" # use existing LATTE installation in $HOME/latte make lib-latte args="-b -m gfortran" # download and build in lib/latte and - # copy Makefile.lammps.gfortran to Makefile.lammps + # copy Makefile.lammps.gfortran to Makefile.lammps :pre Note that 3 symbolic (soft) links, "includelink" and "liblink" and "filelink", are created in 
lib/latte to point into the LATTE home dir. diff --git a/doc/src/atom_modify.txt b/doc/src/atom_modify.txt index d5c82f16ac..1dc0fa6bfb 100644 --- a/doc/src/atom_modify.txt +++ b/doc/src/atom_modify.txt @@ -16,7 +16,7 @@ atom_modify keyword values ... :pre one or more keyword/value pairs may be appended :ulb,l keyword = {id} or {map} or {first} or {sort} :l {id} value = {yes} or {no} - {map} value = {array} or {hash} + {map} value = {yes} or {array} or {hash} {first} value = group-ID = group whose atoms will appear first in internal atom lists {sort} values = Nfreq binsize Nfreq = sort atoms spatially every this many time steps @@ -25,8 +25,8 @@ keyword = {id} or {map} or {first} or {sort} :l [Examples:] -atom_modify map hash -atom_modify map array sort 10000 2.0 +atom_modify map yes +atom_modify map hash sort 10000 2.0 atom_modify first colloid :pre [Description:] @@ -62,29 +62,33 @@ switch. This is described in "Section 2.2"_Section_start.html#start_2 of the manual. If atom IDs are not used, they must be specified as 0 for all atoms, e.g. in a data or restart file. -The {map} keyword determines how atom ID lookup is done for molecular -atom styles. Lookups are performed by bond (angle, etc) routines in -LAMMPS to find the local atom index associated with a global atom ID. +The {map} keyword determines how atoms with specific IDs are found +when required. An example are the bond (angle, etc) methods which +need to find the local index of an atom with a specific global ID +which is a bond (angle, etc) partner. LAMMPS performs this operation +efficiently by creating a "map", which is either an {array} or {hash} +table, as descibed below. -When the {array} value is used, each processor stores a lookup table -of length N, where N is the largest atom ID in the system. This is a +When the {map} keyword is not specified in your input script, LAMMPS +only creates a map for "atom_styles"_atom_style.html for molecular +systems which have permanent bonds (angles, etc). 
No map is created +for atomic systems, since it is normally not needed. However some +LAMMPS commands require a map, even for atomic systems, and will +generate an error if one does not exist. The {map} keyword thus +allows you to force the creation of a map. The {yes} value will +create either an {array} or {hash} style map, as explained in the next +paragraph. The {array} and {hash} values create an atom-style or +hash-style map respectively. + +For an {array}-style map, each processor stores a lookup table of +length N, where N is the largest atom ID in the system. This is a fast, simple method for many simulations, but requires too much memory -for large simulations. The {hash} value uses a hash table to perform -the lookups. This can be slightly slower than the {array} method, but -its memory cost is proportional to the number of atoms owned by a -processor, i.e. N/P when N is the total number of atoms in the system -and P is the number of processors. - -When this setting is not specified in your input script, LAMMPS -creates a map, if one is needed, as an array or hash. See the -discussion of default values below for how LAMMPS chooses which kind -of map to build. Note that atomic systems do not normally need to -create a map. However, even in this case some LAMMPS commands will -create a map to find atoms (and then destroy it), or require a -permanent map. An example of the former is the "velocity loop -all"_velocity.html command, which uses a map when looping over all -atoms and insuring the same velocity values are assigned to an atom -ID, no matter which processor owns it. +for large simulations. For a {hash}-style map, a hash table is +created on each processor, which finds an atom ID in constant time +(independent of the global number of atom IDs). It can be slightly +slower than the {array} map, but its memory cost is proportional to +the number of atoms owned by a processor, i.e. 
N/P when N is the total +number of atoms in the system and P is the number of processors. The {first} keyword allows a "group"_group.html to be specified whose atoms will be maintained as the first atoms in each processor's list diff --git a/doc/src/fix_nh.txt b/doc/src/fix_nh.txt index 8fa30ac222..41d0e6438f 100644 --- a/doc/src/fix_nh.txt +++ b/doc/src/fix_nh.txt @@ -393,32 +393,36 @@ thermostatting and barostatting. :line These fixes compute a temperature and pressure each timestep. To do -this, the fix creates its own computes of style "temp" and "pressure", -as if one of these two sets of commands had been issued: +this, the thermostat and barostat fixes create their own computes of +style "temp" and "pressure", as if one of these sets of commands had +been issued: +For fix nvt: compute fix-ID_temp group-ID temp -compute fix-ID_press group-ID pressure fix-ID_temp :pre +For fix npt and fix nph: compute fix-ID_temp all temp compute fix-ID_press all pressure fix-ID_temp :pre -See the "compute temp"_compute_temp.html and "compute -pressure"_compute_pressure.html commands for details. Note that the -IDs of the new computes are the fix-ID + underscore + "temp" or fix_ID -+ underscore + "press". For fix nvt, the group for the new computes -is the same as the fix group. For fix nph and fix npt, the group for -the new computes is "all" since pressure is computed for the entire -system. +For fix nvt, the group for the new temperature compute is the same as +the fix group. For fix npt and fix nph, the group for both the new +temperature and pressure compute is "all" since pressure is computed +for the entire system. In the case of fix nph, the temperature +compute is not used for thermostatting, but just for a kinetic-energy +contribution to the pressure. See the "compute +temp"_compute_temp.html and "compute pressure"_compute_pressure.html +commands for details. Note that the IDs of the new computes are the +fix-ID + underscore + "temp" or fix_ID + underscore + "press". 
Note that these are NOT the computes used by thermodynamic output (see the "thermo_style"_thermo_style.html command) with ID = {thermo_temp} -and {thermo_press}. This means you can change the attributes of this +and {thermo_press}. This means you can change the attributes of these fix's temperature or pressure via the -"compute_modify"_compute_modify.html command or print this temperature -or pressure during thermodynamic output via the "thermo_style -custom"_thermo_style.html command using the appropriate compute-ID. -It also means that changing attributes of {thermo_temp} or -{thermo_press} will have no effect on this fix. +"compute_modify"_compute_modify.html command. Or you can print this +temperature or pressure during thermodynamic output via the +"thermo_style custom"_thermo_style.html command using the appropriate +compute-ID. It also means that changing attributes of {thermo_temp} +or {thermo_press} will have no effect on this fix. Like other fixes that perform thermostatting, fix nvt and fix npt can be used with "compute commands"_compute.html that calculate a diff --git a/src/atom.cpp b/src/atom.cpp index 1191f0f2b5..7d343a0807 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -453,12 +453,12 @@ void Atom::create_avec(const char *style, int narg, char **arg, int trysuffix) // if molecular system: // atom IDs must be defined // force atom map to be created - // map style may be reset by map_init() and its call to map_style_set() + // map style will be reset to array vs hash to by map_init() molecular = avec->molecular; if (molecular && tag_enable == 0) error->all(FLERR,"Atom IDs must be used for molecular systems"); - if (molecular) map_style = 1; + if (molecular) map_style = 3; } /* ---------------------------------------------------------------------- @@ -593,6 +593,7 @@ void Atom::modify_params(int narg, char **arg) "Atom_modify map command after simulation box is defined"); if (strcmp(arg[iarg+1],"array") == 0) map_user = 1; else if 
(strcmp(arg[iarg+1],"hash") == 0) map_user = 2; + else if (strcmp(arg[iarg+1],"yes") == 0) map_user = 3; else error->all(FLERR,"Illegal atom_modify command"); map_style = map_user; iarg += 2; diff --git a/src/atom_map.cpp b/src/atom_map.cpp index bbfe014dec..9d257d99de 100644 --- a/src/atom_map.cpp +++ b/src/atom_map.cpp @@ -298,12 +298,12 @@ int Atom::map_style_set() MPI_Allreduce(&max,&map_tag_max,1,MPI_LMP_TAGINT,MPI_MAX,world); // set map_style for new map - // if user-selected, use that setting + // if user-selected to array/hash, use that setting // else if map_tag_max > 1M, use hash // else use array int map_style_old = map_style; - if (map_user) map_style = map_user; + if (map_user == 1 || map_user == 2) map_style = map_user; else if (map_tag_max > 1000000) map_style = 2; else map_style = 1; diff --git a/src/create_atoms.cpp b/src/create_atoms.cpp index 04a2df91f8..444b0c5bcd 100644 --- a/src/create_atoms.cpp +++ b/src/create_atoms.cpp @@ -343,6 +343,11 @@ void CreateAtoms::command(int narg, char **arg) } } + // CPU time + + MPI_Barrier(world); + double time1 = MPI_Wtime(); + // clear ghost count and any ghost bonus data internal to AtomVec // same logic as beginning of Comm::exchange() // do it now b/c creating atoms will overwrite ghost atoms @@ -509,6 +514,9 @@ void CreateAtoms::command(int narg, char **arg) if (domain->triclinic) domain->lamda2x(atom->nlocal); } + MPI_Barrier(world); + double time2 = MPI_Wtime(); + // clean up delete ranmol; @@ -521,12 +529,16 @@ void CreateAtoms::command(int narg, char **arg) // print status if (comm->me == 0) { - if (screen) + if (screen) { fprintf(screen,"Created " BIGINT_FORMAT " atoms\n", atom->natoms-natoms_previous); - if (logfile) + fprintf(screen," CPU time = %g secs\n",time2-time1); + } + if (logfile) { fprintf(logfile,"Created " BIGINT_FORMAT " atoms\n", atom->natoms-natoms_previous); + fprintf(logfile," CPU time = %g secs\n",time2-time1); + } } // for MOLECULE mode: diff --git a/src/output.cpp 
b/src/output.cpp index ce7fcb7cca..ce593ec6ae 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -827,9 +827,9 @@ void Output::create_restart(int narg, char **arg) sum and print memory usage result is only memory on proc 0, not averaged across procs ------------------------------------------------------------------------- */ + void Output::memory_usage() { - bigint bytes = 0; bytes += atom->memory_usage(); bytes += neighbor->memory_usage(); diff --git a/src/replicate.cpp b/src/replicate.cpp index e2ed718f65..9c1a271be2 100644 --- a/src/replicate.cpp +++ b/src/replicate.cpp @@ -74,6 +74,11 @@ void Replicate::command(int narg, char **arg) if (atom->nextra_grow || atom->nextra_restart || atom->nextra_store) error->all(FLERR,"Cannot replicate with fixes that store atom quantities"); + // CPU time + + MPI_Barrier(world); + double time1 = MPI_Wtime(); + // maxtag = largest atom tag across all existing atoms tagint maxtag = 0; @@ -424,4 +429,16 @@ void Replicate::command(int narg, char **arg) Special special(lmp); special.build(); } + + // CPU time + + MPI_Barrier(world); + double time2 = MPI_Wtime(); + + if (me == 0) { + if (screen) + fprintf(screen," CPU time = %g secs\n",time2-time1); + if (logfile) + fprintf(logfile," CPU time = %g secs\n",time2-time1); + } } From dc0e20947ee70a6ecaa98726a9fbe76fdab31953 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Fri, 6 Oct 2017 16:37:52 +0200 Subject: [PATCH 50/53] MAINT: Return error when 'at' keyword is used without 'append yes'. 
--- src/USER-NETCDF/dump_netcdf.cpp | 2 ++ src/USER-NETCDF/dump_netcdf_mpiio.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp index 7156b773b3..a9532d1077 100644 --- a/src/USER-NETCDF/dump_netcdf.cpp +++ b/src/USER-NETCDF/dump_netcdf.cpp @@ -924,6 +924,8 @@ int DumpNetCDF::modify_param(int narg, char **arg) return 2; } else if (strcmp(arg[iarg],"at") == 0) { + if (!append_flag) + error->all(FLERR,"expected 'append yes' before 'at' keyword"); iarg++; framei = force->inumeric(FLERR,arg[iarg]); if (framei < 0) framei--; diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp index 29c2b6cb1f..746b904655 100644 --- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp +++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp @@ -920,6 +920,8 @@ int DumpNetCDFMPIIO::modify_param(int narg, char **arg) return 2; } else if (strcmp(arg[iarg],"at") == 0) { + if (!append_flag) + error->all(FLERR,"expected 'append yes' before 'at' keyword"); iarg++; framei = force->inumeric(FLERR,arg[iarg]); if (framei < 0) framei--; From 352a20fc1cb775a79fc23dfc3a56d05b5de664f4 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Fri, 6 Oct 2017 16:38:15 +0200 Subject: [PATCH 51/53] DOC: Updated doc to separate description of 'append' and 'at' keywords. --- doc/src/dump_modify.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/src/dump_modify.txt b/doc/src/dump_modify.txt index 6ccf40a8c5..38d9aad4d9 100644 --- a/doc/src/dump_modify.txt +++ b/doc/src/dump_modify.txt @@ -15,9 +15,11 @@ dump_modify dump-ID keyword values ... 
:pre dump-ID = ID of dump to modify :ulb,l one or more keyword/value pairs may be appended :l these keywords apply to various dump styles :l -keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l - {append} arg = {yes} or {no} or {yes at} N +keyword = {append} or {at} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l + {append} arg = {yes} or {no} + {at} arg = N N = index of frame written upon first dump + only available after "append yes" {buffer} arg = {yes} or {no} {element} args = E1 E2 ... EN, where N = # of atom types E1,...,EN = element name, e.g. C or Fe or Ga From 58e1969de2413a96b57084cf1ffb7d86e54997c0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 6 Oct 2017 14:34:10 -0400 Subject: [PATCH 52/53] rename misleading "CPU time" into "Time spent" --- src/create_atoms.cpp | 6 +++--- src/replicate.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/create_atoms.cpp b/src/create_atoms.cpp index 444b0c5bcd..992049a81f 100644 --- a/src/create_atoms.cpp +++ b/src/create_atoms.cpp @@ -343,7 +343,7 @@ void CreateAtoms::command(int narg, char **arg) } } - // CPU time + // Record wall time for atom creation MPI_Barrier(world); double time1 = MPI_Wtime(); @@ -532,12 +532,12 @@ void CreateAtoms::command(int narg, char **arg) if (screen) { fprintf(screen,"Created " BIGINT_FORMAT " atoms\n", atom->natoms-natoms_previous); - fprintf(screen," CPU time = %g secs\n",time2-time1); + fprintf(screen," Time spent = %g secs\n",time2-time1); } if (logfile) { fprintf(logfile,"Created " BIGINT_FORMAT " atoms\n", atom->natoms-natoms_previous); - fprintf(logfile," CPU time = %g secs\n",time2-time1); + fprintf(logfile," Time spent = %g 
secs\n",time2-time1); } } diff --git a/src/replicate.cpp b/src/replicate.cpp index 9c1a271be2..f3d1964169 100644 --- a/src/replicate.cpp +++ b/src/replicate.cpp @@ -74,7 +74,7 @@ void Replicate::command(int narg, char **arg) if (atom->nextra_grow || atom->nextra_restart || atom->nextra_store) error->all(FLERR,"Cannot replicate with fixes that store atom quantities"); - // CPU time + // Record wall time for atom replication MPI_Barrier(world); double time1 = MPI_Wtime(); @@ -430,15 +430,15 @@ void Replicate::command(int narg, char **arg) special.build(); } - // CPU time + // Wall time MPI_Barrier(world); double time2 = MPI_Wtime(); if (me == 0) { if (screen) - fprintf(screen," CPU time = %g secs\n",time2-time1); + fprintf(screen," Time spent = %g secs\n",time2-time1); if (logfile) - fprintf(logfile," CPU time = %g secs\n",time2-time1); + fprintf(logfile," Time spent = %g secs\n",time2-time1); } } From 6820db99e22fedd7cdda297ad3897436ea04ee54 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 6 Oct 2017 14:41:38 -0400 Subject: [PATCH 53/53] avoid merge conflict --- doc/src/Section_packages.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt index b0b2d9fa63..7539d99cd0 100644 --- a/doc/src/Section_packages.txt +++ b/doc/src/Section_packages.txt @@ -727,7 +727,8 @@ make lib-latte # print help message make lib-latte args="-b" # download and build in lib/latte/LATTE-master make lib-latte args="-p $HOME/latte" # use existing LATTE installation in $HOME/latte make lib-latte args="-b -m gfortran" # download and build in lib/latte and - # copy Makefile.lammps.gfortran to Makefile.lammps :pre + # copy Makefile.lammps.gfortran to Makefile.lammps +:pre Note that 3 symbolic (soft) links, "includelink" and "liblink" and "filelink", are created in lib/latte to point into the LATTE home dir.