Changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5277 f3b2605a-c512-4ea7-a41b-209d697bcdaa
2010-11-23 00:40:35 +00:00
parent ae536ce7d0
commit 5a82c99485
130 changed files with 24967 additions and 4802 deletions
--- a/lib/gpu/gb_gpu_kernel_nbor.cu
+++ b/lib/gpu/gb_gpu_kernel_nbor.cu
@ -0,0 +1,170 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_GPU_KERNEL_H
+#define PAIR_GPU_KERNEL_H
+
+#define MAX_SHARED_TYPES 8
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#else
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#endif
+
+#ifdef NV_KERNEL
+
+#include "geryon/ucl_nv_kernel.h"
+
+#else
+
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+
+#endif
+
+// ---------------------------------------------------------------------------
+// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
+// -- Only unpack neighbors matching the specified inclusive range of forms
+// -- Only unpack neighbors within cutoff
+// ---------------------------------------------------------------------------
+__kernel void kernel_gb_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, 
+                             const int ntypes, __global int *dev_nbor,
+                             const int nbor_pitch, 
+                             const int start, const int inum, 
+                             __global int *dev_ij, const int form_low, 
+                             const int form_high, const int nall) {
+                                
+  // ii indexes the two interacting particles in gi
+  int ii=GLOBAL_ID_X+start;
+
+  if (ii<inum) {
+    __global int *nbor=dev_ij+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+  
+    numtyp4 ix=x_[i];
+    int iw=ix.w;
+    int itype=mul24(iw,ntypes);
+    int newj=0;  
+    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+      int j=*nbor;
+      if (j>=nall)
+        j%=nall;
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+      int mtype=itype+jtype;
+      numtyp2 cf=cut_form[mtype];
+      if (cf.y>=form_low && cf.y<=form_high) {
+        // Compute r12;
+        numtyp rsq=jx.x-ix.x;
+        rsq*=rsq;
+        numtyp t=jx.y-ix.y;
+        rsq+=t*t;
+        t=jx.z-ix.z;
+        rsq+=t*t;
+
+        if (rsq<cf.x) {
+          *packed=j;
+          packed+=nbor_pitch;
+          newj++;
+        }
+      }
+    }
+    dev_nbor[ii+nbor_pitch]=newj;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
+// -- Only unpack neighbors matching the specified inclusive range of forms
+// -- Only unpack neighbors within cutoff
+// -- Fast version of routine that uses shared memory for LJ constants
+// ---------------------------------------------------------------------------
+__kernel void kernel_gb_nbor_fast(__global numtyp4 *x_, 
+                                  __global numtyp2 *cut_form,
+                                  __global int *dev_nbor, 
+                                  const int nbor_pitch, 
+                                  const int start, const int inum, 
+                                  __global int *dev_ij, const int form_low, 
+                                  const int form_high, const int nall) {
+                                
+  int ii=THREAD_ID_X;
+  __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    cutsq[ii]=cut_form[ii].x;
+    form[ii]=cut_form[ii].y;
+  }
+  ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
+  __syncthreads();
+
+  if (ii<inum) {
+    __global int *nbor=dev_ij+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+  
+    numtyp4 ix=x_[i];
+    int iw=ix.w;
+    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+
+    int newj=0;  
+    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+      int j=*nbor;
+      if (j>=nall)
+        j%=nall;
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+      int mtype=itype+jtype;
+      
+      if (form[mtype]>=form_low && form[mtype]<=form_high) {
+        // Compute r12;
+        numtyp rsq=jx.x-ix.x;
+        rsq*=rsq;
+        numtyp t=jx.y-ix.y;
+        rsq+=t*t;
+        t=jx.z-ix.z;
+        rsq+=t*t;
+
+        if (rsq<cutsq[mtype]) {
+          *packed=j;
+          packed+=nbor_pitch;
+          newj++;
+        }
+      }
+    }
+    dev_nbor[ii+nbor_pitch]=newj;
+  }
+}
+
+#endif