diff --git a/src/USER-CUDA/Install.sh b/src/USER-CUDA/Install.sh index 3dc143471f..56f0dc80b9 100755 --- a/src/USER-CUDA/Install.sh +++ b/src/USER-CUDA/Install.sh @@ -82,6 +82,12 @@ if (test $1 = 1) then cp pair_eam_alloy_cuda.h .. cp pair_eam_cuda.h .. cp pair_eam_fs_cuda.h .. + cp pair_sw_cuda.h .. + cp pair_sw_cuda.cpp .. + cp pair_tersoff_cuda.h .. + cp pair_tersoff_cuda.cpp .. + cp pair_tersoff_zbl_cuda.h .. + cp pair_tersoff_zbl_cuda.cpp .. fi if (test -e ../pair_gran_hooke.cpp) then @@ -193,12 +199,9 @@ if (test $1 = 1) then cp verlet_cuda.h .. cp cuda.h .. - cp cuda_common.h .. cp cuda_data.h .. cp cuda_modify_flags.h .. cp cuda_neigh_list.h .. - cp cuda_precision.h .. - cp cuda_shared.h .. elif (test $1 = 0) then @@ -341,12 +344,15 @@ elif (test $1 = 0) then rm -f ../pppm_cuda.h rm -f ../verlet_cuda.h + rm -f ../pair_sw_cuda.h + rm -f ../pair_sw_cuda.cpp + rm -f ../pair_tersoff_cuda.h + rm -f ../pair_tersoff_cuda.cpp + rm -f ../pair_tersoff_zbl_cuda.h + rm -f ../pair_tersoff_zbl_cuda.cpp + rm -f ../cuda.h - rm -f ../cuda_common.h rm -f ../cuda_data.h rm -f ../cuda_modify_flags.h rm -f ../cuda_neigh_list.h - rm -f ../cuda_precision.h - rm -f ../cuda_shared.h - fi diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp index 8e75f93ba5..ea4a4ee6a6 100644 --- a/src/USER-CUDA/comm_cuda.cpp +++ b/src/USER-CUDA/comm_cuda.cpp @@ -41,6 +41,9 @@ using namespace LAMMPS_NS; #define BUFFACTOR 1.5 #define BUFMIN 1000 #define BUFEXTRA 1000 + + + #define BIG 1.0e20 enum{SINGLE,MULTI}; @@ -137,6 +140,7 @@ void CommCuda::init() void CommCuda::setup() { + if(cuda->shared_data.pair.neighall) cutghostuser = MAX(2.0*neighbor->cutneighmax,cutghostuser); Comm::setup(); //upload changed geometry to device diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp index 39261bd7c0..819357bc16 100644 --- a/src/USER-CUDA/cuda.cpp +++ b/src/USER-CUDA/cuda.cpp @@ -46,6 +46,8 @@ using namespace LAMMPS_NS; + + Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp) { cuda_exists=true; @@ -309,6 +311,7 @@ void Cuda::setSharedDataZero() shared_data.pair.special_lj = 0; shared_data.pair.special_coul = 0; + shared_data.pair.neighall = false; shared_data.pppm.cudable_force = 0; diff --git a/src/USER-CUDA/cuda_common.h b/src/USER-CUDA/cuda_common.h deleted file mode 100644 index d4687ebd06..0000000000 --- a/src/USER-CUDA/cuda_common.h +++ /dev/null @@ -1,344 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#ifndef _CUDA_COMMON_H_ -#define _CUDA_COMMON_H_ - -//#include "cutil.h" -#include "cuda_precision.h" -#include "cuda_wrapper_cu.h" - -#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types -//this can not be arbitrarly large, since constant space is limited. -//in principle one could alter potentials to use global memory for parameters, some du that already since the first examples I encountered had a high number (20+) of atom types -//Christian -#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE) -#define CUDA_MAX_NSPECIAL 25 - -// define some easy-to-use debug and emulation macros -#ifdef _DEBUG -#define MYDBG(a) a -#else -#define MYDBG(a) -#endif - -#if __DEVICE_EMULATION__ -#define MYEMU(a) a -#else -#define MYEMU(a) -#endif - -#define MYEMUDBG(a) MYEMU(MYDBG(a)) - -// Add Prefix (needed as workaround, same constant's names in different files causes conflict) -#define MY_ADD_PREFIX(prefix, var) prefix##_##var -#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var) -#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var) - -#define MY_VAR_TO_STR(var) #var -#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var) -#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var)) - -#define CUDA_USE_TEXTURE -#define CUDA_USE_FLOAT4 - -//constants used by many classes - -//domain -#define _boxhi MY_AP(boxhi) -#define _boxlo MY_AP(boxlo) -#define _subhi MY_AP(subhi) -#define _sublo MY_AP(sublo) -#define _box_size MY_AP(box_size) -#define _prd MY_AP(prd) -#define _periodicity MY_AP(periodicity) -#define _triclinic MY_AP(triclinic) -#define _boxhi_lamda MY_AP(boxhi_lamda) -#define _boxlo_lamda MY_AP(boxlo_lamda) -#define _prd_lamda MY_AP(prd_lamda) -#define _h MY_AP(h) -#define _h_inv MY_AP(h_inv) -#define _h_rate MY_AP(h_rate) -__device__ __constant__ X_FLOAT _boxhi[3]; -__device__ __constant__ X_FLOAT _boxlo[3]; -__device__ __constant__ X_FLOAT _subhi[3]; -__device__ __constant__ X_FLOAT _sublo[3]; -__device__ __constant__ X_FLOAT _box_size[3]; -__device__ __constant__ X_FLOAT _prd[3]; -__device__ __constant__ int _periodicity[3]; -__device__ __constant__ int _triclinic; -__device__ __constant__ X_FLOAT _boxhi_lamda[3]; -__device__ __constant__ X_FLOAT _boxlo_lamda[3]; -__device__ __constant__ X_FLOAT _prd_lamda[3]; -__device__ __constant__ X_FLOAT _h[6]; -__device__ __constant__ X_FLOAT _h_inv[6]; -__device__ __constant__ V_FLOAT _h_rate[6]; - - -//atom properties -#define _x MY_AP(x) -#define _v MY_AP(v) -#define _f MY_AP(f) -#define _tag MY_AP(tag) -#define _type MY_AP(type) -#define _mask MY_AP(mask) -#define _image MY_AP(image) -#define _q MY_AP(q) -#define _mass MY_AP(mass) -#define _rmass MY_AP(rmass) -#define _rmass_flag MY_AP(rmass_flag) -#define _eatom MY_AP(eatom) -#define _vatom MY_AP(vatom) -#define _x_type MY_AP(x_type) -#define _radius MY_AP(radius) -#define _density MY_AP(density) -#define _omega MY_AP(omega) -#define _torque MY_AP(torque) -#define _special MY_AP(special) -#define _maxspecial MY_AP(maxspecial) -#define _nspecial MY_AP(nspecial) -#define _special_flag MY_AP(special_flag) -#define _molecule MY_AP(molecule) -#define _v_radius MY_AP(v_radius) -#define _omega_rmass MY_AP(omega_rmass) -#define _freeze_group_bit MY_AP(freeze_group_bit) -#define _map_array MY_AP(map_array) -__device__ __constant__ X_FLOAT* _x; //holds pointer to positions -__device__ __constant__ V_FLOAT* _v; -__device__ __constant__ F_FLOAT* _f; -__device__ __constant__ int* _tag; -__device__ __constant__ int* _type; -__device__ __constant__ int* _mask; -__device__ __constant__ int* _image; -__device__ __constant__ V_FLOAT* _mass; -__device__ __constant__ F_FLOAT* _q; -__device__ __constant__ V_FLOAT* _rmass; -__device__ __constant__ int _rmass_flag; -__device__ __constant__ ENERGY_FLOAT* _eatom; -__device__ __constant__ ENERGY_FLOAT* _vatom; -__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to positions -__device__ __constant__ X_FLOAT* _radius; -__device__ __constant__ F_FLOAT* _density; -__device__ __constant__ V_FLOAT* _omega; -__device__ __constant__ F_FLOAT* _torque; -__device__ __constant__ int* _special; -__device__ __constant__ int _maxspecial; -__device__ __constant__ int* _nspecial; -__device__ __constant__ int _special_flag[4]; -__device__ __constant__ int* _molecule; -__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to positions -__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to positions -__device__ __constant__ int _freeze_group_bit; -__device__ __constant__ int* _map_array; - -#ifdef CUDA_USE_TEXTURE - - #define _x_tex MY_AP(x_tex) - #if X_PRECISION == 1 - texture _x_tex; - #else - texture _x_tex; - #endif - - #define _type_tex MY_AP(type_tex) - texture _type_tex; - - #define _x_type_tex MY_AP(x_type_tex) - #if X_PRECISION == 1 - texture _x_type_tex; - #else - texture _x_type_tex; - #endif - - #define _v_radius_tex MY_AP(v_radius_tex) - #if V_PRECISION == 1 - texture _v_radius_tex; - #else - texture _v_radius_tex; - #endif - - #define _omega_rmass_tex MY_AP(omega_rmass_tex) - #if V_PRECISION == 1 - texture _omega_rmass_tex; - #else - texture _omega_rmass_tex; - #endif - - #define _q_tex MY_AP(q_tex) - #if F_PRECISION == 1 - texture _q_tex; - #else - texture _q_tex; - #endif - -#endif - -//neighbor -#ifdef IncludeCommonNeigh -#define _inum MY_AP(inum) -#define _inum_border MY_AP(inum_border) -#define _ilist MY_AP(ilist) -#define _ilist_border MY_AP(ilist_border) -#define _numneigh MY_AP(numneigh) -#define _numneigh_border MY_AP(numneigh_border) -#define _numneigh_inner MY_AP(numneigh_inner) -#define _firstneigh MY_AP(firstneigh) -#define _neighbors MY_AP(neighbors) -#define _neighbors_border MY_AP(neighbors_border) -#define _neighbors_inner MY_AP(neighbors_inner) -#define _reneigh_flag MY_AP(reneigh_flag) -#define _triggerneighsq MY_AP(triggerneighsq) -#define _xhold MY_AP(xhold) -#define _maxhold MY_AP(maxhold) -#define _dist_check MY_AP(dist_check) -#define _neighbor_maxlocal MY_AP(neighbor_maxlocal) -#define _maxneighbors MY_AP(maxneighbors) -#define _overlap_comm MY_AP(overlap_comm) -__device__ __constant__ int _inum; -__device__ __constant__ int* _inum_border; -__device__ __constant__ int* _ilist; -__device__ __constant__ int* _ilist_border; -__device__ __constant__ int* _numneigh; -__device__ __constant__ int* _numneigh_border; -__device__ __constant__ int* _numneigh_inner; -__device__ __constant__ int** _firstneigh; -__device__ __constant__ int* _neighbors; -__device__ __constant__ int* _neighbors_border; -__device__ __constant__ int* _neighbors_inner; -__device__ __constant__ int* _reneigh_flag; -__device__ __constant__ X_FLOAT _triggerneighsq; -__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions -__device__ __constant__ int _maxhold; -__device__ __constant__ int _dist_check; -__device__ __constant__ int _neighbor_maxlocal; -__device__ __constant__ int _maxneighbors; -__device__ __constant__ int _overlap_comm; -#endif - -//system properties -#define _nall MY_AP(nall) -#define _nghost MY_AP(nghost) -#define _nlocal MY_AP(nlocal) -#define _nmax MY_AP(nmax) -#define _cuda_ntypes MY_AP(cuda_ntypes) -#define _dtf MY_AP(dtf) -#define _dtv MY_AP(dtv) -#define _factor MY_AP(factor) -#define _virial MY_AP(virial) -#define _eng_vdwl MY_AP(eng_vdwl) -#define _eng_coul MY_AP(eng_coul) -#define _molecular MY_AP(molecular) -__device__ __constant__ unsigned _nall; -__device__ __constant__ unsigned _nghost; -__device__ __constant__ unsigned _nlocal; -__device__ __constant__ unsigned _nmax; -__device__ __constant__ unsigned _cuda_ntypes; -__device__ __constant__ V_FLOAT _dtf; -__device__ __constant__ X_FLOAT _dtv; -__device__ __constant__ V_FLOAT _factor; -__device__ __constant__ ENERGY_FLOAT* _virial; -__device__ __constant__ ENERGY_FLOAT* _eng_vdwl; -__device__ __constant__ ENERGY_FLOAT* _eng_coul; -__device__ __constant__ int _molecular; - -//other general constants -#define _buffer MY_AP(buffer) -#define _flag MY_AP(flag) -#define _debugdata MY_AP(debugdata) -__device__ __constant__ void* _buffer; -__device__ __constant__ int* _flag; -__device__ __constant__ int* _debugdata; - -// pointers to data fields on GPU are hold in constant space -// -> reduces register usage and number of parameters for kernelcalls -// will be variables of file scope in cuda files - - - - -// maybe used to output cudaError_t -#define MY_OUTPUT_RESULT(result) \ - switch(result) \ - { \ - case cudaSuccess: printf(" => cudaSuccess\n"); break; \ - case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \ - case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \ - case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \ - case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \ - default: printf(" => unknown\n"); break; \ - } - -#ifdef _DEBUG -# define CUT_CHECK_ERROR(errorMessage) { \ - cudaError_t err = cudaGetLastError(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - } -#else -# define CUT_CHECK_ERROR(errorMessage) { \ - cudaError_t err = cudaGetLastError(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - } -#endif - -# define CUDA_SAFE_CALL_NO_SYNC( call) { \ - cudaError err = call; \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ - __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ - } } - -# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call); - -#define X_MASK 1 -#define V_MASK 2 -#define F_MASK 4 -#define TAG_MASK 8 -#define TYPE_MASK 16 -#define MASK_MASK 32 -#define IMAGE_MASK 64 -#define Q_MASK 128 -#define MOLECULE_MASK 256 -#define RMASS_MASK 512 -#define RADIUS_MASK 1024 -#define DENSITY_MASK 2048 -#define OMEGA_MASK 4096 -#define TORQUE_MASK 8192 - - - -#endif // #ifdef _CUDA_COMMON_H_ diff --git a/src/USER-CUDA/cuda_precision.h b/src/USER-CUDA/cuda_precision.h deleted file mode 100644 index 5b7d6a6843..0000000000 --- a/src/USER-CUDA/cuda_precision.h +++ /dev/null @@ -1,269 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#ifndef CUDA_PRECISION_H_ -#define CUDA_PRECISION_H_ -/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA. - * Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation). - * ***_FLOAT: type definition of given property - * ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F) - */ - -#ifdef CUDA_USE_BINNING -#define CUDA_IF_BINNING(a) a -#else -#define CUDA_IF_BINNING(a) -#endif - -//GLOBAL - -#ifdef CUDA_PRECISION - #if CUDA_PRECISION == 1 - #define CUDA_FLOAT float - #define CUDA_F(x) x##f - #endif - #if CUDA_PRECISION == 2 - #define CUDA_FLOAT double - #define CUDA_F(x) x - #endif -#endif - -#ifndef CUDA_PRECISION - #define CUDA_FLOAT double - #define CUDA_F(x) x - #define CUDA_PRECISION 2 -#endif -//-------------------------------- -//-----------FFT----------------- -//-------------------------------- - -#ifdef FFT_PRECISION_CU - #if FFT_PRECISION_CU == 1 - #define FFT_FLOAT float - #define FFT_F(x) x##f - #endif - #if FFT_PRECISION_CU == 2 - #define FFT_FLOAT double - #define FFT_F(x) x - #endif -#endif - -#ifndef FFT_PRECISION_CU - #define FFT_FLOAT CUDA_FLOAT - #define FFT_F(x) CUDA_F(x) - #define FFT_PRECISION_CU CUDA_PRECISION -#endif - -//-------------------------------- -//-----------PPPM----------------- -//-------------------------------- - -#ifdef PPPM_PRECISION - #if PPPM_PRECISION == 1 - #define PPPM_FLOAT float - #define PPPM_F(x) x##f - #endif - #if PPPM_PRECISION == 2 - #define PPPM_FLOAT double - #define PPPM_F(x) x - #endif -#endif - -#ifndef PPPM_PRECISION - #define PPPM_FLOAT CUDA_FLOAT - #define PPPM_F(x) CUDA_F(x) - #define PPPM_PRECISION CUDA_PRECISION -#endif - -//-------------------------------- -//-----------FORCE----------------- -//-------------------------------- - - -#ifdef F_PRECISION - #if F_PRECISION == 1 - #define F_FLOAT float - #define F_F(x) x##f - #endif - #if F_PRECISION == 2 - #define F_FLOAT double - #define F_F(x) x - #endif -#endif - -#ifndef F_PRECISION - #define F_FLOAT CUDA_FLOAT - #define F_F(x) CUDA_F(x) - #define F_PRECISION CUDA_PRECISION -#endif - -#if F_PRECISION == 1 -#define _SQRT_ sqrtf -#define _RSQRT_ rsqrtf -#define _EXP_ expf -#else -#define _SQRT_ sqrt -#define _RSQRT_ rsqrt -#define _EXP_ exp -#endif - -#if F_PRECISION == 2 -struct F_FLOAT2 -{ - F_FLOAT x; - F_FLOAT y; -}; -struct F_FLOAT3 -{ - F_FLOAT x; - F_FLOAT y; - F_FLOAT z; -}; -struct F_FLOAT4 -{ - F_FLOAT x; - F_FLOAT y; - F_FLOAT z; - F_FLOAT w; -}; -#else -#define F_FLOAT2 float2 -#define F_FLOAT3 float3 -#define F_FLOAT4 float4 -#endif -//-------------------------------- -//-----------ENERGY----------------- -//-------------------------------- - -#ifndef ENERGY_PRECISION - #define ENERGY_FLOAT CUDA_FLOAT - #define ENERGY_F(x) CUDA_F(x) -#endif - -#ifdef ENERGY_PRECISION - #if ENERGY_PRECISION == 1 - #define ENERGY_FLOAT float - #define ENERGY_F(x) x##f - #endif - #if ENERGY_PRECISION == 2 - #define ENERGY_FLOAT double - #define ENERGY_F(x) x - #endif -#endif - -#ifndef ENERGY_PRECISION - #define ENERGY_FLOAT CUDA_FLOAT - #define ENERGY_F(x) CUDA_F(x) - #define ENERGY_PRECISION CUDA_PRECISION -#endif - -//-------------------------------- -//-----------POSITIONS------------ -//-------------------------------- - -#ifdef X_PRECISION - #if X_PRECISION == 1 - #define X_FLOAT float - #define X_F(x) x##f - #endif - #if X_PRECISION == 2 - #define X_FLOAT double - #define X_F(x) x - #endif -#endif - -#ifndef X_PRECISION - #define X_FLOAT CUDA_FLOAT - #define X_F(x) CUDA_F(x) - #define X_PRECISION CUDA_PRECISION -#endif - -#if X_PRECISION == 2 -struct X_FLOAT2 -{ - X_FLOAT x; - X_FLOAT y; -}; -struct X_FLOAT3 -{ - X_FLOAT x; - X_FLOAT y; - X_FLOAT z; -}; -struct X_FLOAT4 -{ - X_FLOAT x; - X_FLOAT y; - X_FLOAT z; - X_FLOAT w; -}; -#else -#define X_FLOAT2 float2 -#define X_FLOAT3 float3 -#define X_FLOAT4 float4 -#endif - -//-------------------------------- -//-----------velocities----------- -//-------------------------------- - -#ifdef V_PRECISION - #if V_PRECISION == 1 - #define V_FLOAT float - #define V_F(x) x##f - #endif - #if V_PRECISION == 2 - #define V_FLOAT double - #define V_F(x) x - #endif -#endif - -#ifndef V_PRECISION - #define V_FLOAT CUDA_FLOAT - #define V_F(x) CUDA_F(x) - #define V_PRECISION CUDA_PRECISION -#endif - -#if V_PRECISION == 2 -struct V_FLOAT4 -{ - V_FLOAT x; - V_FLOAT y; - V_FLOAT z; - V_FLOAT w; -}; -#else -#define V_FLOAT4 float4 -#endif - -#ifdef NO_PREC_TIMING -struct timespec_2 -{ - unsigned int tv_sec; - unsigned int tv_nsec; -}; - -#define timespec timespec_2 -#define clock_gettime(a,b) -#endif -#endif /*CUDA_PRECISION_H_*/ diff --git a/src/USER-CUDA/cuda_shared.h b/src/USER-CUDA/cuda_shared.h deleted file mode 100644 index f7983fff05..0000000000 --- a/src/USER-CUDA/cuda_shared.h +++ /dev/null @@ -1,378 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#ifndef _CUDA_SHARED_H_ -#define _CUDA_SHARED_H_ -#include "cuda_precision.h" - -#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int) - -struct dev_array -{ - void* dev_data; // pointer to memory address on cuda device - unsigned dim[3]; // array dimensions -}; - -struct cuda_shared_atom // relevent data from atom class -{ - dev_array dx; // cumulated distance for binning settings - dev_array x; // position - dev_array v; // velocity - dev_array f; // force - dev_array tag; - dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1) - dev_array mask; - dev_array image; - dev_array q; // charges - dev_array mass; // per-type masses - dev_array rmass; // per-atom masses - dev_array radius; // per-atom radius - dev_array density; - dev_array omega; - dev_array torque; - dev_array molecule; - - dev_array special; - int maxspecial; - dev_array nspecial; - int* special_flag; - int molecular; - - dev_array eatom; // per-atom energy - dev_array vatom; // per-atom virial - int need_eatom; - int need_vatom; - - dev_array x_type; // position + type in X_FLOAT4 struct - dev_array v_radius; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style - dev_array omega_rmass; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style - - double* mass_host; // remember per-type host pointer to masses - //int natoms; // total # of atoms in system, could be 0 - int nghost; // and ghost atoms on this proc - int nlocal; // # of owned - int nall; // total # of atoms in this proc - int nmax; // max # of owned+ghost in arrays on this proc - int ntypes; - int q_flag; // do we have charges? - int rmass_flag; // do we have per-atom masses? - int firstgroup; - int nfirst; - - int update_nlocal; - int update_nmax; - - dev_array xhold; // position at last neighboring - X_FLOAT triggerneighsq; // maximum square movement before reneighboring - int reneigh_flag; // is reneighboring necessary - int maxhold; // size of xhold - int dist_check; //perform distance check for reneighboring - dev_array binned_id; //id of each binned atom (not tag!!) - dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]] - float bin_extraspace; - int bin_dim[3]; - int bin_nmax; - dev_array map_array; -}; - -struct cuda_shared_pair // relevent data from pair class -{ - char cudable_force; // check for (cudable_force!=0) - X_FLOAT cut_global; - X_FLOAT cut_inner_global; - X_FLOAT cut_coul_global; - double** cut; // type-type cutoff - double** cutsq; // type-type cutoff - double** cut_inner; // type-type cutoff for coul - double** cut_coul; // type-type cutoff for coul - double** coeff1; // tpye-type pair parameters - double** coeff2; - double** coeff3; - double** coeff4; - double** coeff5; - double** coeff6; - double** coeff7; - double** coeff8; - double** coeff9; - double** coeff10; - double** offset; - double* special_lj; - double* special_coul; - dev_array virial; // ENERGY_FLOAT - dev_array eng_vdwl; // ENERGY_FLOAT - dev_array eng_coul; // ENERGY_FLOAT - X_FLOAT cut_coulsq_global; - F_FLOAT g_ewald,kappa; - int freeze_group_bit; - - dev_array coeff1_gm; - dev_array coeff2_gm; - dev_array coeff3_gm; - dev_array coeff4_gm; - dev_array coeff5_gm; - dev_array coeff6_gm; - dev_array coeff7_gm; - dev_array coeff8_gm; - dev_array coeff9_gm; - dev_array coeff10_gm; - - int lastgridsize; - int n_energy_virial; - int collect_forces_later; - int use_block_per_atom; - int override_block_per_atom; - -}; - -struct cuda_shared_domain // relevent data from domain class -{ - X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc - X_FLOAT subhi[3]; - X_FLOAT boxlo[3]; - X_FLOAT boxhi[3]; - X_FLOAT prd[3]; - int periodicity[3]; // xyz periodicity as array - - int triclinic; - X_FLOAT xy; - X_FLOAT xz; - X_FLOAT yz; - X_FLOAT boxlo_lamda[3]; - X_FLOAT boxhi_lamda[3]; - X_FLOAT prd_lamda[3]; - X_FLOAT h[6]; - X_FLOAT h_inv[6]; - V_FLOAT h_rate[6]; - int update; -}; - -struct cuda_shared_pppm -{ - char cudable_force; -#ifdef FFT_CUFFT - FFT_FLOAT* work1; - FFT_FLOAT* work2; - FFT_FLOAT* work3; - PPPM_FLOAT* greensfn; - PPPM_FLOAT* fkx; - PPPM_FLOAT* fky; - PPPM_FLOAT* fkz; - PPPM_FLOAT* vg; -#endif - int* part2grid; - PPPM_FLOAT* density_brick; - int* density_brick_int; - PPPM_FLOAT density_intScale; - PPPM_FLOAT* vdx_brick; - PPPM_FLOAT* vdy_brick; - PPPM_FLOAT* vdz_brick; - PPPM_FLOAT* density_fft; - ENERGY_FLOAT* energy; - ENERGY_FLOAT* virial; - int nxlo_in; - int nxhi_in; - int nxlo_out; - int nxhi_out; - int nylo_in; - int nyhi_in; - int nylo_out; - int nyhi_out; - int nzlo_in; - int nzhi_in; - int nzlo_out; - int nzhi_out; - int nx_pppm; - int ny_pppm; - int nz_pppm; - PPPM_FLOAT qqrd2e; - int order; - // float3 sublo; - PPPM_FLOAT* rho_coeff; - int nmax; - int nlocal; - PPPM_FLOAT* debugdata; - PPPM_FLOAT delxinv; - PPPM_FLOAT delyinv; - PPPM_FLOAT delzinv; - int nlower; - int nupper; - PPPM_FLOAT shiftone; - -}; - -struct cuda_shared_comm -{ - int maxswap; - int maxlistlength; - dev_array pbc; - dev_array slablo; - dev_array slabhi; - dev_array multilo; - dev_array multihi; - dev_array sendlist; - int grow_flag; - int comm_phase; - - int nsend; - int* nsend_swap; - int* send_size; - int* recv_size; - double** buf_send; - void** buf_send_dev; - double** buf_recv; - void** buf_recv_dev; - void* buffer; - int buffer_size; - double overlap_split_ratio; -}; - -struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data -{ - int maxlocal; - int inum; // # of I atoms neighbors are stored for local indices of I atoms - int inum_border2; - dev_array inum_border; // # of atoms which interact with border atoms - dev_array ilist; - dev_array ilist_border; - dev_array numneigh; - dev_array numneigh_inner; - dev_array numneigh_border; - dev_array firstneigh; - dev_array neighbors; - dev_array neighbors_border; - dev_array neighbors_inner; - int maxpage; - dev_array page_pointers; - dev_array* pages; - int maxneighbors; - int neigh_lists_per_page; - double** cutneighsq; - CUDA_FLOAT* cu_cutneighsq; - int* binned_id; - int* bin_dim; - int bin_nmax; - float bin_extraspace; - double maxcut; - dev_array ex_type; - int nex_type; - dev_array ex1_bit; - dev_array ex2_bit; - int nex_group; - dev_array ex_mol_bit; - int nex_mol; - -}; - -struct cuda_compile_settings // this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files -{ - int prec_glob; - int prec_x; - int prec_v; - int prec_f; - int prec_pppm; - int prec_fft; - int cufft; - int arch; -}; - -struct cuda_timings_struct -{ - //Debug: - double test1; - double test2; - //transfers - double transfer_upload_tmp_constr; - double transfer_download_tmp_deconstr; - - //communication - double comm_forward_total; - double comm_forward_mpi_upper; - double comm_forward_mpi_lower; - double comm_forward_kernel_pack; - double comm_forward_kernel_unpack; - double comm_forward_kernel_self; - double comm_forward_upload; - double comm_forward_download; - - double comm_exchange_total; - double comm_exchange_mpi; - double comm_exchange_kernel_pack; - double comm_exchange_kernel_unpack; - double comm_exchange_kernel_fill; - double comm_exchange_cpu_pack; - double comm_exchange_upload; - double comm_exchange_download; - - double comm_border_total; - double comm_border_mpi; - double comm_border_kernel_pack; - double comm_border_kernel_unpack; - double comm_border_kernel_self; - double comm_border_kernel_buildlist; - double comm_border_upload; - double comm_border_download; - - //pair forces - double pair_xtype_conversion; - double pair_kernel; - double pair_virial; - double pair_force_collection; - - //neighbor - double neigh_bin; - double neigh_build; - double neigh_special; - - //PPPM - double pppm_particle_map; - double pppm_make_rho; - double pppm_brick2fft; - double pppm_poisson; - double pppm_fillbrick; - double pppm_fieldforce; - double pppm_compute; - -}; - -struct cuda_shared_data // holds space for all relevent data from the different classes -{ - void* buffer; //holds temporary GPU data [data used in subroutines, which has not to be consistend outside of that routine] - int buffersize; //maxsize of buffer - int buffer_new; //should be 1 if the pointer to buffer has changed - void* flag; - void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array - cuda_shared_atom atom; - cuda_shared_pair pair; - cuda_shared_domain domain; - cuda_shared_pppm pppm; - cuda_shared_comm comm; - cuda_compile_settings compile_settings; - cuda_timings_struct cuda_timings; - int exchange_dim; - int me; //mpi rank - unsigned int datamask; - int overlap_comm; -}; - - -#endif // #ifndef _CUDA_SHARED_H_ diff --git a/src/USER-CUDA/neighbor_cuda.cpp b/src/USER-CUDA/neighbor_cuda.cpp index 99bf2dce3c..dc5af9f2f8 100644 --- a/src/USER-CUDA/neighbor_cuda.cpp +++ b/src/USER-CUDA/neighbor_cuda.cpp @@ -26,6 +26,9 @@ using namespace LAMMPS_NS; + + + enum{NSQ,BIN,MULTI}; // also in neigh_list.cpp /* ---------------------------------------------------------------------- */ @@ -56,9 +59,9 @@ void NeighborCuda::choose_build(int index, NeighRequest *rq) { Neighbor::choose_build(index,rq); - if (rq->full && style == NSQ && rq->ghost == 0 && rq->cudable) + if (rq->full && style == NSQ && rq->cudable) pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda; - else if (rq->full && style == BIN && rq->ghost == 0 && rq->cudable) + else if (rq->full && style == BIN && rq->cudable) pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda; } diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp index 0c3f675fae..fbaa1800a5 100644 --- a/src/USER-CUDA/verlet_cuda.cpp +++ b/src/USER-CUDA/verlet_cuda.cpp @@ -21,6 +21,7 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ + #include #include #include @@ -56,6 +57,7 @@ using namespace LAMMPS_NS; #define MAKETIMEING + VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) { cuda = lmp->cuda; if(cuda == NULL) @@ -132,20 +134,19 @@ void VerletCuda::setup() cuda->uploadAll(); neighbor->build(); neighbor->ncalls = 0; - cuda->uploadAllNeighborLists(); + if(atom->mass) cuda->cu_mass->upload(); if(cuda->cu_map_array) cuda->cu_map_array->upload(); - + // compute all forces ev_set(update->ntimestep); if(elist_atom) cuda->shared_data.atom.need_eatom = 1; if(vlist_atom) cuda->shared_data.atom.need_vatom = 1; if(elist_atom||vlist_atom) cuda->checkResize(); - int test_BpA_vs_TpA = true; timespec starttime;