git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-05-02 15:02:52 +00:00
parent 2be078632d
commit 5f799182b3
70 changed files with 4489 additions and 2253 deletions
--- a/lib/gpu/lj96_cut_gpu_kernel.cu
+++ b/lib/gpu/lj96_cut_gpu_kernel.cu
@ -18,8 +18,6 @@
 #ifndef LJ96_GPU_KERNEL
 #define LJ96_GPU_KERNEL

-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@ -46,7 +44,7 @@

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;

 #ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define __inline inline

 #define fetch_pos(i,y) x_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8

 #endif

@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types, 
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, const int inum, 
-                          const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
  if (ii<inum) {
-  
-    acctyp energy=(acctyp)0;
-    acctyp4 f;
-    f.x=(acctyp)0;
-    f.y=(acctyp)0;
-    f.z=(acctyp)0;
-    acctyp virial[6];
-    for (int i=0; i<6; i++)
-      virial[i]=(acctyp)0;
-  
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
  
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
  
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
@ -157,8 +172,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
      }

    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;

-    // Store answers
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
@ -176,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in, __global int *dev_nbor, 
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const int nall, const int nbor_pitch,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
-  if (ii<4)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
    if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
  }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
+  
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
  __syncthreads();
  
  if (ii<inum) {
-  
-    acctyp energy=(acctyp)0;
-    acctyp4 f;
-    f.x=(acctyp)0;
-    f.y=(acctyp)0;
-    f.z=(acctyp)0;
-    acctyp virial[6];
-    for (int i=0; i<6; i++)
-      virial[i]=(acctyp)0;
-  
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
  
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
  
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
@ -258,8 +328,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
      }

    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;

-    // Store answers
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;