git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15931 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2016-12-14 00:00:20 +00:00
parent 89719fb171
commit 5f04559071
5 changed files with 54 additions and 12 deletions
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -63,7 +63,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
  _nbor_pack_width = 1;

  _precision_mode = PREC_MODE_MIXED;
-  _offload_balance = 1.0;
+  _offload_balance = -1.0;
  _overflow_flag[LMP_OVERFLOW] = 0;
  _off_overflow_flag[LMP_OVERFLOW] = 0;

@@ -189,10 +189,18 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
      offload_cores = omp_get_num_procs();
      omp_set_num_threads(offload_cores);
      max_offload_threads = omp_get_max_threads();
+      #ifdef __AVX512F__
+      if ( (offload_cores / 4) % 2 == 1) {
+        offload_cores += 4;
+        max_offload_threads += 4;
+      }
+      #endif
    }
    _max_offload_threads = max_offload_threads;
    _offload_cores = offload_cores;
    if (_offload_threads == 0) _offload_threads = offload_cores;
+    if (_offload_cores > 244 && _offload_tpc > 2)
+      _offload_tpc = 2;
  }
  #endif

@@ -317,6 +325,8 @@ void FixIntel::init()
    error->all(FLERR,
 	       "Currently, cannot use more than one intel style with hybrid.");

+  neighbor->fix_intel = (void *)this;
+
  check_neighbor_intel();
  if (_precision_mode == PREC_MODE_SINGLE)
    _single_buffers->zero_ev();
@@ -1004,8 +1014,10 @@ void FixIntel::set_offload_affinity()
  int offload_threads = _offload_threads;
  int offload_tpc = _offload_tpc;
  int offload_affinity_balanced = _offload_affinity_balanced;
+  int offload_cores = _offload_cores;
  #pragma offload target(mic:_cop) mandatory \
-    in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced)
+    in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced, \
+       offload_cores)
  {
    omp_set_num_threads(offload_threads);
    #pragma omp parallel
@@ -1013,20 +1025,24 @@ void FixIntel::set_offload_affinity()
      int tnum = omp_get_thread_num();
      kmp_affinity_mask_t mask;
      kmp_create_affinity_mask(&mask);
-      int proc;
-      if (offload_affinity_balanced) {
-	proc = offload_threads * node_rank + tnum;
+      int proc = offload_threads * node_rank + tnum;
+      #ifdef __AVX512F__
+      proc = (proc / offload_tpc) + (proc % offload_tpc) * 
+	     ((offload_cores) / 4);
+      proc += 68;
+      #else
+      if (offload_affinity_balanced)
 	proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
-      } else {
-	proc = offload_threads * node_rank + tnum;
+      else
 	proc += (proc / 4) * (4 - offload_tpc) + 1;
-      }
+      #endif
      kmp_set_affinity_mask_proc(proc, &mask);
      if (kmp_set_affinity(&mask) != 0)
 	printf("Could not set affinity on rank %d thread %d to %d\n",
 	       node_rank, tnum, proc);
    }
  }
+
  if (_precision_mode == PREC_MODE_SINGLE)
    _single_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
  else if (_precision_mode == PREC_MODE_MIXED)
--- a/src/USER-INTEL/intel_intrinsics.h
+++ b/src/USER-INTEL/intel_intrinsics.h
@@ -28,7 +28,7 @@
 // implementations.

 // Vector classes provided with the intel compiler
-#ifdef __MIC__
+#if defined(__MIC__) && !defined(__AVX512F__)
 #include <mic/micvec.h>
 #else
 #include <dvec.h> // icc-mmic hates generating movq
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -22,6 +22,11 @@
 #ifdef __INTEL_OFFLOAD
 #ifdef LMP_INTEL_OFFLOAD
 #define _LMP_INTEL_OFFLOAD
+#ifdef __TARGET_ARCH_MIC
+#ifndef __MIC__
+#define __MIC__ 1
+#endif
+#endif
 #endif
 #endif

@@ -62,6 +67,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_STENCIL_CHECK 4096
 #define INTEL_P3M_MAXORDER 5

+#ifdef __INTEL_COMPILER
 #ifdef __AVX__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 8
@@ -90,6 +96,13 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #endif
 #endif

+#else
+
+#undef INTEL_VECTOR_WIDTH
+#define INTEL_VECTOR_WIDTH 1
+
+#endif
+
 #define INTEL_DATA_ALIGN 64
 #define INTEL_ONEATOM_FACTOR 2
 #define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
@@ -97,7 +110,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_LB_MEAN_WEIGHT 0.1
 #define INTEL_BIGP 1e15
 #define INTEL_MAX_HOST_CORE_COUNT 512
-#define INTEL_MAX_COI_CORES 2
+#define INTEL_MAX_COI_CORES 36

 #define IP_PRE_get_stride(stride, n, datasize, torque)	\
  {								\
--- a/src/USER-INTEL/verlet_lrt_intel.cpp
+++ b/src/USER-INTEL/verlet_lrt_intel.cpp
@@ -43,7 +43,20 @@ using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */

 VerletLRTIntel::VerletLRTIntel(LAMMPS *lmp, int narg, char **arg) :
-  Verlet(lmp, narg, arg) {}
+  Verlet(lmp, narg, arg) {
+  #if defined(_LMP_INTEL_LRT_PTHREAD)
+  pthread_mutex_init(&_kmutex,NULL);
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+VerletLRTIntel::~VerletLRTIntel() 
+{
+  #if defined(_LMP_INTEL_LRT_PTHREAD)
+  pthread_mutex_destroy(&_kmutex);
+  #endif
+}

 /* ----------------------------------------------------------------------
   initialization before run
--- a/src/USER-INTEL/verlet_lrt_intel.h
+++ b/src/USER-INTEL/verlet_lrt_intel.h
@@ -40,7 +40,7 @@ namespace LAMMPS_NS {
 class VerletLRTIntel : public Verlet {
 public:
  VerletLRTIntel(class LAMMPS *, int, char **);
-  virtual ~VerletLRTIntel() {}
+  virtual ~VerletLRTIntel();
  virtual void init();
  virtual void setup();
  virtual void run(int);