diff --git a/lib/cuda/cuda_common.h b/lib/cuda/cuda_common.h new file mode 100644 index 0000000000..d4687ebd06 --- /dev/null +++ b/lib/cuda/cuda_common.h @@ -0,0 +1,344 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef _CUDA_COMMON_H_ +#define _CUDA_COMMON_H_ + +//#include "cutil.h" +#include "cuda_precision.h" +#include "cuda_wrapper_cu.h" + +#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types +//this can not be arbitrarly large, since constant space is limited. +//in principle one could alter potentials to use global memory for parameters, some du that already since the first examples I encountered had a high number (20+) of atom types +//Christian +#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE) +#define CUDA_MAX_NSPECIAL 25 + +// define some easy-to-use debug and emulation macros +#ifdef _DEBUG +#define MYDBG(a) a +#else +#define MYDBG(a) +#endif + +#if __DEVICE_EMULATION__ +#define MYEMU(a) a +#else +#define MYEMU(a) +#endif + +#define MYEMUDBG(a) MYEMU(MYDBG(a)) + +// Add Prefix (needed as workaround, same constant's names in different files causes conflict) +#define MY_ADD_PREFIX(prefix, var) prefix##_##var +#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var) +#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var) + +#define MY_VAR_TO_STR(var) #var +#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var) +#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var)) + +#define CUDA_USE_TEXTURE +#define CUDA_USE_FLOAT4 + +//constants used by many classes + +//domain +#define _boxhi MY_AP(boxhi) +#define _boxlo MY_AP(boxlo) +#define _subhi MY_AP(subhi) +#define _sublo MY_AP(sublo) +#define _box_size MY_AP(box_size) +#define _prd MY_AP(prd) +#define _periodicity MY_AP(periodicity) +#define _triclinic MY_AP(triclinic) +#define _boxhi_lamda MY_AP(boxhi_lamda) +#define _boxlo_lamda MY_AP(boxlo_lamda) +#define _prd_lamda MY_AP(prd_lamda) +#define _h MY_AP(h) +#define _h_inv MY_AP(h_inv) +#define _h_rate MY_AP(h_rate) +__device__ __constant__ X_FLOAT _boxhi[3]; +__device__ __constant__ X_FLOAT _boxlo[3]; +__device__ __constant__ X_FLOAT _subhi[3]; +__device__ __constant__ X_FLOAT _sublo[3]; +__device__ __constant__ X_FLOAT _box_size[3]; +__device__ __constant__ X_FLOAT _prd[3]; +__device__ __constant__ int _periodicity[3]; +__device__ __constant__ int _triclinic; +__device__ __constant__ X_FLOAT _boxhi_lamda[3]; +__device__ __constant__ X_FLOAT _boxlo_lamda[3]; +__device__ __constant__ X_FLOAT _prd_lamda[3]; +__device__ __constant__ X_FLOAT _h[6]; +__device__ __constant__ X_FLOAT _h_inv[6]; +__device__ __constant__ V_FLOAT _h_rate[6]; + + +//atom properties +#define _x MY_AP(x) +#define _v MY_AP(v) +#define _f MY_AP(f) +#define _tag MY_AP(tag) +#define _type MY_AP(type) +#define _mask MY_AP(mask) +#define _image MY_AP(image) +#define _q MY_AP(q) +#define _mass MY_AP(mass) +#define _rmass MY_AP(rmass) +#define _rmass_flag MY_AP(rmass_flag) +#define _eatom MY_AP(eatom) +#define _vatom MY_AP(vatom) +#define _x_type MY_AP(x_type) +#define _radius MY_AP(radius) +#define _density MY_AP(density) +#define _omega MY_AP(omega) +#define _torque MY_AP(torque) +#define _special MY_AP(special) +#define _maxspecial MY_AP(maxspecial) +#define _nspecial MY_AP(nspecial) +#define _special_flag MY_AP(special_flag) +#define _molecule MY_AP(molecule) +#define _v_radius MY_AP(v_radius) +#define _omega_rmass MY_AP(omega_rmass) +#define _freeze_group_bit MY_AP(freeze_group_bit) +#define _map_array MY_AP(map_array) +__device__ __constant__ X_FLOAT* _x; //holds pointer to positions +__device__ __constant__ V_FLOAT* _v; +__device__ __constant__ F_FLOAT* _f; +__device__ __constant__ int* _tag; +__device__ __constant__ int* _type; +__device__ __constant__ int* _mask; +__device__ __constant__ int* _image; +__device__ __constant__ V_FLOAT* _mass; +__device__ __constant__ F_FLOAT* _q; +__device__ __constant__ V_FLOAT* _rmass; +__device__ __constant__ int _rmass_flag; +__device__ __constant__ ENERGY_FLOAT* _eatom; +__device__ __constant__ ENERGY_FLOAT* _vatom; +__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to positions +__device__ __constant__ X_FLOAT* _radius; +__device__ __constant__ F_FLOAT* _density; +__device__ __constant__ V_FLOAT* _omega; +__device__ __constant__ F_FLOAT* _torque; +__device__ __constant__ int* _special; +__device__ __constant__ int _maxspecial; +__device__ __constant__ int* _nspecial; +__device__ __constant__ int _special_flag[4]; +__device__ __constant__ int* _molecule; +__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to positions +__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to positions +__device__ __constant__ int _freeze_group_bit; +__device__ __constant__ int* _map_array; + +#ifdef CUDA_USE_TEXTURE + + #define _x_tex MY_AP(x_tex) + #if X_PRECISION == 1 + texture _x_tex; + #else + texture _x_tex; + #endif + + #define _type_tex MY_AP(type_tex) + texture _type_tex; + + #define _x_type_tex MY_AP(x_type_tex) + #if X_PRECISION == 1 + texture _x_type_tex; + #else + texture _x_type_tex; + #endif + + #define _v_radius_tex MY_AP(v_radius_tex) + #if V_PRECISION == 1 + texture _v_radius_tex; + #else + texture _v_radius_tex; + #endif + + #define _omega_rmass_tex MY_AP(omega_rmass_tex) + #if V_PRECISION == 1 + texture _omega_rmass_tex; + #else + texture _omega_rmass_tex; + #endif + + #define _q_tex MY_AP(q_tex) + #if F_PRECISION == 1 + texture _q_tex; + #else + texture _q_tex; + #endif + +#endif + +//neighbor +#ifdef IncludeCommonNeigh +#define _inum MY_AP(inum) +#define _inum_border MY_AP(inum_border) +#define _ilist MY_AP(ilist) +#define _ilist_border MY_AP(ilist_border) +#define _numneigh MY_AP(numneigh) +#define _numneigh_border MY_AP(numneigh_border) +#define _numneigh_inner MY_AP(numneigh_inner) +#define _firstneigh MY_AP(firstneigh) +#define _neighbors MY_AP(neighbors) +#define _neighbors_border MY_AP(neighbors_border) +#define _neighbors_inner MY_AP(neighbors_inner) +#define _reneigh_flag MY_AP(reneigh_flag) +#define _triggerneighsq MY_AP(triggerneighsq) +#define _xhold MY_AP(xhold) +#define _maxhold MY_AP(maxhold) +#define _dist_check MY_AP(dist_check) +#define _neighbor_maxlocal MY_AP(neighbor_maxlocal) +#define _maxneighbors MY_AP(maxneighbors) +#define _overlap_comm MY_AP(overlap_comm) +__device__ __constant__ int _inum; +__device__ __constant__ int* _inum_border; +__device__ __constant__ int* _ilist; +__device__ __constant__ int* _ilist_border; +__device__ __constant__ int* _numneigh; +__device__ __constant__ int* _numneigh_border; +__device__ __constant__ int* _numneigh_inner; +__device__ __constant__ int** _firstneigh; +__device__ __constant__ int* _neighbors; +__device__ __constant__ int* _neighbors_border; +__device__ __constant__ int* _neighbors_inner; +__device__ __constant__ int* _reneigh_flag; +__device__ __constant__ X_FLOAT _triggerneighsq; +__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions +__device__ __constant__ int _maxhold; +__device__ __constant__ int _dist_check; +__device__ __constant__ int _neighbor_maxlocal; +__device__ __constant__ int _maxneighbors; +__device__ __constant__ int _overlap_comm; +#endif + +//system properties +#define _nall MY_AP(nall) +#define _nghost MY_AP(nghost) +#define _nlocal MY_AP(nlocal) +#define _nmax MY_AP(nmax) +#define _cuda_ntypes MY_AP(cuda_ntypes) +#define _dtf MY_AP(dtf) +#define _dtv MY_AP(dtv) +#define _factor MY_AP(factor) +#define _virial MY_AP(virial) +#define _eng_vdwl MY_AP(eng_vdwl) +#define _eng_coul MY_AP(eng_coul) +#define _molecular MY_AP(molecular) +__device__ __constant__ unsigned _nall; +__device__ __constant__ unsigned _nghost; +__device__ __constant__ unsigned _nlocal; +__device__ __constant__ unsigned _nmax; +__device__ __constant__ unsigned _cuda_ntypes; +__device__ __constant__ V_FLOAT _dtf; +__device__ __constant__ X_FLOAT _dtv; +__device__ __constant__ V_FLOAT _factor; +__device__ __constant__ ENERGY_FLOAT* _virial; +__device__ __constant__ ENERGY_FLOAT* _eng_vdwl; +__device__ __constant__ ENERGY_FLOAT* _eng_coul; +__device__ __constant__ int _molecular; + +//other general constants +#define _buffer MY_AP(buffer) +#define _flag MY_AP(flag) +#define _debugdata MY_AP(debugdata) +__device__ __constant__ void* _buffer; +__device__ __constant__ int* _flag; +__device__ __constant__ int* _debugdata; + +// pointers to data fields on GPU are hold in constant space +// -> reduces register usage and number of parameters for kernelcalls +// will be variables of file scope in cuda files + + + + +// maybe used to output cudaError_t +#define MY_OUTPUT_RESULT(result) \ + switch(result) \ + { \ + case cudaSuccess: printf(" => cudaSuccess\n"); break; \ + case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \ + case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \ + case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \ + case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \ + default: printf(" => unknown\n"); break; \ + } + +#ifdef _DEBUG +# define CUT_CHECK_ERROR(errorMessage) { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + } +#else +# define CUT_CHECK_ERROR(errorMessage) { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + } +#endif + +# define CUDA_SAFE_CALL_NO_SYNC( call) { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } + +# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call); + +#define X_MASK 1 +#define V_MASK 2 +#define F_MASK 4 +#define TAG_MASK 8 +#define TYPE_MASK 16 +#define MASK_MASK 32 +#define IMAGE_MASK 64 +#define Q_MASK 128 +#define MOLECULE_MASK 256 +#define RMASS_MASK 512 +#define RADIUS_MASK 1024 +#define DENSITY_MASK 2048 +#define OMEGA_MASK 4096 +#define TORQUE_MASK 8192 + + + +#endif // #ifdef _CUDA_COMMON_H_