git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8904 f3b2605a-c512-4ea7-a41b-209d697bcdaa
@@ -14,7 +14,8 @@ SHELL = /bin/sh

# System-specific settings

CUDA_INSTALL_PATH = /usr/local/cuda
#CUDA_INSTALL_PATH = /usr/local/cuda
CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
# e.g. in Gentoo
# CUDA_INSTALL_PATH = /opt/cuda

@@ -96,12 +97,26 @@ else
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_21
else
ifeq ($(strip $(arch)), 30)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_30
else
ifeq ($(strip $(arch)), 35)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_35
else
CUDA_FLAGS += -DCUDA_ARCH=99
SMVERSIONFLAGS := -arch sm_13
endif
endif
endif
endif
endif
@@ -32,6 +32,7 @@ inline int3 getgrid(int n,int shared_per_thread=0,int threadsmax=256, bool p2=fa
{
int3 gridparams;
int sharedsize = 16000;

if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax;

if((n < 60 * 32) || (threadsmax < 64))
@@ -44,20 +45,26 @@ inline int3 getgrid(int n,int shared_per_thread=0,int threadsmax=256, bool p2=fa
gridparams.z = 256;
else gridparams.z = 512;

if(p2)
{
if(p2) {
gridparams.z = 16;

while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
}

int blocks = (n + gridparams.z - 1) / gridparams.z;

if(blocks > 10000)
gridparams.x = gridparams.y = int(sqrt(blocks));
else
{gridparams.x=blocks; gridparams.y=1;}
else {
gridparams.x = blocks;
gridparams.y = 1;
}

while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;

if(gridparams.x == 0) gridparams.x = 1;

return gridparams;
}
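For context, a sketch of how a caller might turn the int3 returned by getgrid() into an actual launch configuration; the kernel name and its arguments are hypothetical, not part of this commit:

__global__ void my_kernel(float* data, int n);     // hypothetical kernel

void launch(float* d_data, int n)
{
    int3 layout = getgrid(n, 0, 256, true);        // layout.z = threads per block (power of two when p2 is set)
    dim3 threads(layout.z, 1, 1);
    dim3 grid(layout.x, layout.y, 1);
    my_kernel<<<grid, threads>>>(d_data, n);       // grid.x * grid.y * threads.x >= n; kernel guards with index < n
}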
@@ -80,14 +87,15 @@ static inline __device__ void copySharedToGlob(int* shared, int* glob,const int&
{
int i, k;
k = n - blockDim.x;
for(i=0;i<k;i+=blockDim.x)
{
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x<n-i)
{
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}

__syncthreads();
}

@@ -95,14 +103,15 @@ static inline __device__ void copySharedToGlob(float* shared, float* glob,const
{
int i, k;
k = n - blockDim.x;
for(i=0;i<k;i+=blockDim.x)
{
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x<n-i)
{
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}

__syncthreads();
}

@@ -110,14 +119,15 @@ static inline __device__ void copySharedToGlob(double* shared, double* glob,cons
{
int i, k;
k = n - blockDim.x;
for(i=0;i<k;i+=blockDim.x)
{
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x<n-i)
{
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}

__syncthreads();
}

@@ -125,14 +135,15 @@ static inline __device__ void copyGlobToShared(int* glob,int* shared,const int&
{
int i, k;
k = n - blockDim.x;
for(i=0;i<k;i+=blockDim.x)
{
for(i = 0; i < k; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x<n-i)
{
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}

__syncthreads();
}

@@ -140,28 +151,30 @@ static __device__ inline void copyGlobToShared(float* glob,float* shared,const i
{
int i, k;
k = n - blockDim.x;
for(i=0;i<k;i+=blockDim.x)
{
for(i = 0; i < k; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x<n-i)
{
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}

__syncthreads();
}

static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
int i;
for(i=0;i<n-blockDim.x;i+=blockDim.x)
{
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x<n-i)
{
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}

__syncthreads();
}
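All six copy helpers follow the same pattern: full blockDim.x-wide chunks in the main loop, then a guarded tail copy of the remaining n - i elements, then a barrier. A hedged usage sketch; process_slice and the per-block slice layout are assumptions, not part of the patched file:

__global__ void process_slice(int* glob, int n)        // assumes n <= 1024
{
    __shared__ int buf[1024];
    copyGlobToShared(glob + blockIdx.x * n, buf, n);   // strided, tail-guarded copy; helper ends in __syncthreads()
    buf[threadIdx.x] += 1;                             // placeholder work on the staged slice
    __syncthreads();
    copySharedToGlob(buf, glob + blockIdx.x * n, n);   // write the slice back out
}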
@@ -170,14 +183,15 @@ static __device__ inline void copyData(double* source,double* target,const int&
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i=0;i<n-blockDim.x*blockDim.y*blockDim.z;i+=blockDim.x*blockDim.y*blockDim.z)
{
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset<n-i)
{
if(offset < n - i) {
target[i + offset] = source[i + offset];
}

__syncthreads();
}

@@ -185,14 +199,15 @@ static __device__ inline void copyData(float* source,float* target,const int& n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i=0;i<n-blockDim.x*blockDim.y*blockDim.z;i+=blockDim.x*blockDim.y*blockDim.z)
{
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset<n-i)
{
if(offset < n - i) {
target[i + offset] = source[i + offset];
}

__syncthreads();
}

@@ -200,14 +215,15 @@ static __device__ inline void copyData(int* source,int* target,const int& n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i=0;i<n-blockDim.x*blockDim.y*blockDim.z;i+=blockDim.x*blockDim.y*blockDim.z)
{
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset<n-i)
{
if(offset < n - i) {
target[i + offset] = source[i + offset];
}

__syncthreads();
}

@@ -215,14 +231,15 @@ static __device__ inline void copyData(unsigned int* source,unsigned int* target
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i=0;i<n-blockDim.x*blockDim.y*blockDim.z;i+=blockDim.x*blockDim.y*blockDim.z)
{
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset<n-i)
{
if(offset < n - i) {
target[i + offset] = source[i + offset];
}

__syncthreads();
}

@@ -232,10 +249,11 @@ static __device__ inline void copyData(unsigned int* source,unsigned int* target
static __device__ inline void reduceBlockP2(int* data)
{
__syncthreads();
for(int i=2;i<=blockDim.x;i*=2)
{
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

__syncthreads();
}
}

@@ -243,10 +261,11 @@ static __device__ inline void reduceBlockP2(int* data)
static __device__ inline void reduceBlockP2(unsigned int* data)
{
__syncthreads();
for(int i=2;i<=blockDim.x;i*=2)
{
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

__syncthreads();
}
}

@@ -254,10 +273,11 @@ static __device__ inline void reduceBlockP2(unsigned int* data)
static __device__ inline void reduceBlockP2(float* data)
{
__syncthreads();
for(int i=2;i<=blockDim.x;i*=2)
{
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

__syncthreads();
}
}

@@ -265,10 +285,11 @@ static __device__ inline void reduceBlockP2(float* data)
static __device__ inline void reduceBlockP2(double* data)
{
__syncthreads();
for(int i=2;i<=blockDim.x;i*=2)
{
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

__syncthreads();
}
}
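reduceBlockP2() performs a tree reduction in shared memory and, as the P2 suffix suggests, is only correct when blockDim.x is a power of two, so that blockDim.x / i halves cleanly at every step. A minimal usage sketch for a 128-thread block; block_sum_p2 is a hypothetical caller, not part of the patched file:

__global__ void block_sum_p2(const float* in, float* out)
{
    __shared__ float buf[128];                        // blockDim.x must equal 128 here
    buf[threadIdx.x] = in[blockIdx.x * 128 + threadIdx.x];
    reduceBlockP2(buf);                               // leaves the block total in buf[0]
    if(threadIdx.x == 0) out[blockIdx.x] = buf[0];
}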
@@ -277,15 +298,18 @@ static __device__ inline void reduceBlock(float* data)
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < blockDim.x) p2 *= 2;

if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];

__syncthreads();
}
}

@@ -294,15 +318,18 @@ static __device__ inline void reduceBlock(int* data)
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < blockDim.x) p2 *= 2;

if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];

__syncthreads();
}
}

@@ -311,15 +338,18 @@ static __device__ inline void reduceBlock(unsigned int* data)
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < blockDim.x) p2 *= 2;

if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];

__syncthreads();
}
}

@@ -328,15 +358,18 @@ static __device__ inline void reduceBlock(double* data)
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < blockDim.x) p2 *= 2;

if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];

__syncthreads();
}
}
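reduceBlock() drops the power-of-two requirement: it finds the largest p2 with p2 * 2 < blockDim.x, folds the blockDim.x - p2 overhang onto the front of the array, and then tree-reduces over exactly p2 elements. A small self-test sketch (names assumed) for a 96-thread block, where p2 = 64 and a 32-element overhang is folded first:

__global__ void reduce_selftest(float* out)
{
    __shared__ float buf[96];
    buf[threadIdx.x] = 1.0f;               // every thread contributes 1
    reduceBlock(buf);                      // fold 32-element overhang, tree-reduce 64
    if(threadIdx.x == 0) *out = buf[0];    // expect 96.0f
}
// launch: reduce_selftest<<<1, 96>>>(d_out);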
@@ -344,20 +377,22 @@ static __device__ inline void reduceBlock(double* data)
static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
int i;
for(i=0;i<n-blockDim.x;i+=blockDim.x)
{
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
data[i + threadIdx.x] = value;
}

if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
int i;
for(i=0;i<n-blockDim.x;i+=blockDim.x)
{
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
data[i + threadIdx.x] = value;
}

if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

@@ -365,22 +400,24 @@ static __device__ inline void reduce(float* data,int n) //cautious not sure if w
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < n) p2 *= 2;

int j = 0;
while((threadIdx.x+blockDim.x*j)*2<n-p2)
{
while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
j++;
}

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
while((threadIdx.x+blockDim.x*j)<p2/i)
{
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
j++;
}

__syncthreads();
}
}

@@ -389,22 +426,24 @@ static __device__ inline void reduce(double* data,int n) //cautious not sure if
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < n) p2 *= 2;

int j = 0;
while((threadIdx.x+blockDim.x*j)*2<n-p2)
{
while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
j++;
}

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
while((threadIdx.x+blockDim.x*j)<p2/i)
{
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
j++;
}

__syncthreads();
}
}

@@ -413,15 +452,18 @@ static __device__ inline void minOfBlock(float* data)
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < blockDim.x) p2 *= 2;

if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

__syncthreads();
}
}

@@ -430,15 +472,18 @@ static __device__ inline void maxOfBlock(float* data)
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < blockDim.x) p2 *= 2;

if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

__syncthreads();
}
}

@@ -447,15 +492,18 @@ static __device__ inline void minOfBlock(double* data)
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < blockDim.x) p2 *= 2;

if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

__syncthreads();
}
}

@@ -464,15 +512,18 @@ static __device__ inline void maxOfBlock(double* data)
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < blockDim.x) p2 *= 2;

if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

__syncthreads();
}
}

@@ -482,22 +533,24 @@ static __device__ inline void minOfData(double* data,int n) //cautious not sure
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < n) p2 *= 2;

int j = 0;
while((threadIdx.x+blockDim.x*j)<n-p2)
{
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
while((threadIdx.x+blockDim.x*j)<p2/i)
{
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}

__syncthreads();
}
}

@@ -506,22 +559,24 @@ static __device__ inline void maxOfData(double* data,int n) //cautious not sure
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < n) p2 *= 2;

int j = 0;
while((threadIdx.x+blockDim.x*j)<n-p2)
{
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
while((threadIdx.x+blockDim.x*j)<p2/i)
{
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}

__syncthreads();
}
}

@@ -530,22 +585,24 @@ static __device__ inline void minOfData(float* data,int n) //cautious not sure i
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < n) p2 *= 2;

int j = 0;
while((threadIdx.x+blockDim.x*j)<n-p2)
{
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
while((threadIdx.x+blockDim.x*j)<p2/i)
{
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}

__syncthreads();
}
}

@@ -554,22 +611,24 @@ static __device__ inline void maxOfData(float* data,int n) //cautious not sure i
{
__syncthreads();
int p2 = 1;

while(p2 * 2 < n) p2 *= 2;

int j = 0;
while((threadIdx.x+blockDim.x*j)<n-p2)
{
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}

__syncthreads();
for(int i=2;i<=p2;i*=2)
{
while((threadIdx.x+blockDim.x*j)<p2/i)
{
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}

__syncthreads();
}
}
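These n-element variants (reduce, minOfData, maxOfData) let one block sweep an array larger than blockDim.x by advancing a per-thread stride counter j. Note, as the in-code "cautious" comments already flag, that j is not reset between the fold phase and the tree phase. A reset-per-phase variant would look like the sketch below; this is an assumption about a safer formulation, not the library's code:

static __device__ inline void reduce_reset(float* data, int n)
{
    __syncthreads();
    int p2 = 1;
    while(p2 * 2 < n) p2 *= 2;                      // largest p2 with p2 * 2 < n

    for(int idx = threadIdx.x; idx < n - p2; idx += blockDim.x)
        data[idx] += data[idx + p2];                // fold the overhang onto the front

    __syncthreads();
    for(int i = 2; i <= p2; i *= 2) {
        for(int idx = threadIdx.x; idx < p2 / i; idx += blockDim.x)
            data[idx] += data[idx + p2 / i];        // standard tree step over p2 elements
        __syncthreads();
    }
}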
@@ -601,8 +660,7 @@ inline void BindXTypeTexture(cuda_shared_data* sdata)
_x_type_tex.normalized = false; // access with normalized texture coordinates
_x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* x_type_texture_ptr;
cudaGetTextureReference(&x_type_texture_ptr, MY_CONST(x_type_tex));
const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);

#if X_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();

@@ -654,8 +712,7 @@ inline void BindVRadiusTexture(cuda_shared_data* sdata)
_v_radius_tex.normalized = false; // access with normalized texture coordinates
_v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* v_radius_texture_ptr;
cudaGetTextureReference(&v_radius_texture_ptr, MY_CONST(v_radius_tex));
const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);

#if V_PRECISION == 1
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();

@@ -686,8 +743,7 @@ inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
_omega_rmass_tex.normalized = false; // access with normalized texture coordinates
_omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* omega_rmass_texture_ptr;
cudaGetTextureReference(&omega_rmass_texture_ptr, MY_CONST(omega_rmass_tex));
const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);

#if V_PRECISION == 1
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();

@@ -739,8 +795,7 @@ inline void BindQTexture(cuda_shared_data* sdata)
_q_tex.normalized = false; // access with normalized texture coordinates
_q_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* q_texture_ptr;
cudaGetTextureReference(&q_texture_ptr, MY_CONST(q_tex));
const textureReference* q_texture_ptr = &MY_AP(q_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
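The recurring change in these Bind*Texture() hunks replaces the cudaGetTextureReference() lookup with taking the address of the statically declared texture reference directly, presumably for compatibility with CUDA 5.0, which dropped by-name lookup of module symbols. In isolation the new pattern looks like this; my_tex and the buffer parameters are placeholders:

texture<float4, 1> my_tex;                               // placeholder texture reference

void bind(const float4* dev_ptr, size_t nbytes)
{
    const textureReference* p = &my_tex;                 // direct address, no runtime lookup
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
    cudaBindTexture(0, p, dev_ptr, &desc, nbytes);       // bind the device buffer to the reference
}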
@@ -776,7 +831,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
_coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff_texture_ptr;
cudaGetTextureReference(&coeff_texture_ptr, MY_CONST(coeff_tex));
cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
@@ -803,7 +858,8 @@ static __device__ inline X_FLOAT4 fetchXType(int i)
*/
#define SBBITS 30

static inline __device__ int sbmask(int j) {
static inline __device__ int sbmask(int j)
{
return j >> SBBITS & 3;
}
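sbmask() extracts the two bits above SBBITS, which LAMMPS uses to tag a packed neighbor entry with its special-bond status (0 = plain neighbor, 1/2/3 = 1-2/1-3/1-4 special, per LAMMPS convention). A hedged decoding sketch; the packed-entry variable is hypothetical:

int jj = neighbors[k];               // hypothetical packed neighbor entry
int which = sbmask(jj);              // special-bond code from the top two bits
int j = jj & ~(3 << SBBITS);         // strip the flag bits to recover the neighbor index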
@@ -814,10 +870,12 @@ static inline __device__ void minimum_image(X_FLOAT4& delta)
delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
(delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
}

if(_periodicity[1]) {
delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
(delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
}

if(_periodicity[2]) {
delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
(delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
@@ -833,6 +891,7 @@ static inline __device__ void minimum_image(X_FLOAT4& delta)
(delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));

}

if(_periodicity[1]) {
delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
(delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
@@ -840,6 +899,7 @@ static inline __device__ void minimum_image(X_FLOAT4& delta)
(delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));

}

if(_periodicity[0]) {
delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
(delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
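Each component is wrapped by the same minimum-image rule: if the separation falls outside (-prd/2, prd/2], add or subtract one box length (or the corresponding tilt term _h[...] in the triclinic branch). Restated as a standalone helper for one orthogonal component, using the file's own X_FLOAT/X_F precision macros:

static __device__ X_FLOAT wrap_component(X_FLOAT d, X_FLOAT prd)
{
    if(d < X_F(-0.5) * prd) return d + prd;    // crossed the lower face
    if(d > X_F(0.5) * prd) return d - prd;     // crossed the upper face
    return d;                                  // already the nearest image
}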
@@ -194,8 +194,8 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vfla
void Cuda_UpdateBuffer(cuda_shared_data* sdata, int size)
{
CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed");
if(sdata->buffersize<size)
{
if(sdata->buffersize < size) {
MYDBG(printf("Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
@@ -203,30 +203,30 @@ void Cuda_UpdateBuffer(cuda_shared_data* sdata,int size)
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );

cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed");
}
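The MY_CONST to MY_AP switch in these calls matches the texture changes above: the __constant__ symbol itself is passed to cudaMemcpyToSymbol() rather than a name to be looked up, which is the only form CUDA 5.0 still accepts. A minimal standalone example; buffer_d is a placeholder symbol:

__constant__ int* buffer_d;                    // placeholder __constant__ symbol

void publish_buffer(int* dev_buffer)
{
    // CUDA >= 5.0 form: pass the symbol, not the string "buffer_d".
    cudaMemcpyToSymbol(buffer_d, &dev_buffer, sizeof(int*));
}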
void Cuda_Pair_UpdateNeighbor_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
//Neighbor
cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) );
cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm, sizeof(int));

if(sdata->overlap_comm)
{
cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) );
if(sdata->overlap_comm) {
cudaMemcpyToSymbol(MY_AP(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*));
}

}
@@ -236,23 +236,23 @@ void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighli
CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin");

//System
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));

//Atom
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));

//Other
cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) );
cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*));
CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End");
}
@@ -269,16 +269,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
printf("# CUDA: Cuda_Pair_Init: you need %u types. this is more than %u "
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
"or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1);

if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params)
exit(0);

//type conversion of cutoffs and parameters
if(need_cut)
{
if(need_cut) {
X_FLOAT cutsq[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
}
}
@@ -286,60 +286,54 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
int cutsqdiffer = 0;
X_FLOAT cutsq_global;
cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
if(sdata->pair.cut)
{
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=i; j<=sdata->atom.ntypes; ++j)
{
if(sdata->pair.cut[i][j]>1e-6)
{

if(sdata->pair.cut) {
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = i; j <= sdata->atom.ntypes; ++j) {
if(sdata->pair.cut[i][j] > 1e-6) {
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
}

if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];

if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6)
cutsqdiffer++;
}
}
}

if(sdata->pair.cutsq)
{
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=i; j<=sdata->atom.ntypes; ++j)
{
if(sdata->pair.cut[i][j]>1e-6)
{
if(sdata->pair.cutsq) {
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = i; j <= sdata->atom.ntypes; ++j) {
if(sdata->pair.cut[i][j] > 1e-6) {
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
}

if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];

if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6)
cutsqdiffer++;
}
}
}

//printf("CUTSQGLOB: %i %e\n",cutsqdiffer,cutsq_global);
if(cutsqdiffer)
{
if(cutsqdiffer) {
cutsq_global = -1.0;
cudaMemcpyToSymbol(MY_CONST(cutsq) , cutsq , nx );
}
cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx);
}

if(need_innercut)
{
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
}

if(need_innercut) {
X_FLOAT cut_innersq[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
}
}
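The cutoff setup above flattens the per-type-pair cutoffs into a row-major cuda_ntypes x cuda_ntypes table; whenever any pair deviates from the global value (squared difference above 1e-6), the full table is uploaded and the scalar is set to -1.0 as a "use the table" sentinel. A hedged sketch of how a kernel might consume that sentinel; the underscore-prefixed device names are assumptions:

X_FLOAT csq = (_cutsq_global > X_F(0.0))
              ? _cutsq_global                              // uniform cutoff for all pairs
              : _cutsq[itype * _cuda_ntypes + jtype];      // per-pair table lookup

if(delx * delx + dely * dely + delz * delz < csq) {
    // pair is inside the cutoff; compute the interaction
}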
@@ -347,38 +341,36 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
int cutsqdiffer = 0;
X_FLOAT cut_innersq_global;
cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
if(sdata->pair.cut_inner)
{
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=i; j<=sdata->atom.ntypes; ++j)
{
if(sdata->pair.cut_inner[i][j]>1e-6)
{

if(sdata->pair.cut_inner) {
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = i; j <= sdata->atom.ntypes; ++j) {
if(sdata->pair.cut_inner[i][j] > 1e-6) {
cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
}

if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j];

if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) * (cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6)
cutsqdiffer++;
}
}
}
if(cutsqdiffer)
{

if(cutsqdiffer) {
cut_innersq_global = -1.0;
cudaMemcpyToSymbol(MY_CONST(cut_innersq) , cut_innersq , nx );
}
cudaMemcpyToSymbol(MY_CONST(cut_innersq_global) ,&cut_innersq_global , sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx);
}

if(need_q)
{
cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT));
}

if(need_q) {
X_FLOAT cut_coulsq[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
}
}
@@ -386,54 +378,52 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
int cutsqdiffer = 0;
X_FLOAT cut_coulsq_global;
cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);

if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global;
if(sdata->pair.cut_coul)
{
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=i; j<=sdata->atom.ntypes; ++j)
{
if(sdata->pair.cut_coul[i][j]>1e-6)
{

if(sdata->pair.cut_coul) {
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = i; j <= sdata->atom.ntypes; ++j) {
if(sdata->pair.cut_coul[i][j] > 1e-6) {
cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
}

if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j];

if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) * (cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6)
cutsqdiffer++;
}
}
}
if(cutsqdiffer)
{

if(cutsqdiffer) {
cut_coulsq_global = -1.0;
cudaMemcpyToSymbol(MY_CONST(cut_coulsq) , cut_coulsq , nx );
cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx);
}
cudaMemcpyToSymbol(MY_CONST(cut_coulsq_global),&cut_coulsq_global , sizeof(X_FLOAT) );

cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT));
}

CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed");

if(ncoeff>0)
{
if(ncoeff > 0) {
F_FLOAT coeff1[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice);

_coeff1_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff1_gm_texture_ptr;
cudaGetTextureReference(&coeff1_gm_texture_ptr, MY_CONST(coeff1_gm_tex));
const textureReference* coeff1_gm_texture_ptr = &MY_AP(coeff1_gm_tex);
CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed");

#if F_PRECISION == 1
@@ -448,33 +438,29 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed");
#endif

}
else
} else
cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n);
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed");

if(ncoeff>1)
{
if(ncoeff > 1) {
F_FLOAT coeff2[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice);

_coeff2_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff2_gm_texture_ptr;
cudaGetTextureReference(&coeff2_gm_texture_ptr, MY_CONST(coeff2_gm_tex));
const textureReference* coeff2_gm_texture_ptr = &MY_AP(coeff2_gm_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
@@ -484,32 +470,28 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif

}
else
} else
cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n);
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed");

if(ncoeff>2)
{
if(ncoeff > 2) {
F_FLOAT coeff3[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice);
_coeff3_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff3_gm_texture_ptr;
cudaGetTextureReference(&coeff3_gm_texture_ptr, MY_CONST(coeff3_gm_tex));
const textureReference* coeff3_gm_texture_ptr = &MY_AP(coeff3_gm_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
@@ -518,32 +500,28 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
}
else
} else
cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n);
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed");

if(ncoeff>3)
{
if(ncoeff > 3) {
F_FLOAT coeff4[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice);
_coeff4_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff4_gm_texture_ptr;
cudaGetTextureReference(&coeff4_gm_texture_ptr, MY_CONST(coeff4_gm_tex));
const textureReference* coeff4_gm_texture_ptr = &MY_AP(coeff4_gm_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
@@ -552,32 +530,28 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
}
else
} else
cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n);
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed");

if(ncoeff>4)
{
if(ncoeff > 4) {
F_FLOAT coeff5[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice);
_coeff5_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff5_gm_texture_ptr;
cudaGetTextureReference(&coeff5_gm_texture_ptr, MY_CONST(coeff5_gm_tex));
const textureReference* coeff5_gm_texture_ptr = &MY_AP(coeff5_gm_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
@@ -586,32 +560,28 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
#endif
}
else
} else
cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n);
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed");
if(ncoeff>5)
{

if(ncoeff > 5) {
F_FLOAT coeff6[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice);
_coeff6_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff6_gm_texture_ptr;
cudaGetTextureReference(&coeff6_gm_texture_ptr, MY_CONST(coeff6_gm_tex));
const textureReference* coeff6_gm_texture_ptr = &MY_AP(coeff6_gm_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
@@ -622,28 +592,25 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
#endif
}
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed");

if(ncoeff>6)
{
if(ncoeff > 6) {
F_FLOAT coeff7[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice);
_coeff7_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff7_gm_texture_ptr;
cudaGetTextureReference(&coeff7_gm_texture_ptr, MY_CONST(coeff7_gm_tex));
const textureReference* coeff7_gm_texture_ptr = &MY_AP(coeff7_gm_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
@@ -654,28 +621,25 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
#endif
}
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed");

if(ncoeff>7)
{
if(ncoeff > 7) {
F_FLOAT coeff8[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice);
_coeff8_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff8_gm_texture_ptr;
cudaGetTextureReference(&coeff8_gm_texture_ptr, MY_CONST(coeff8_gm_tex));
const textureReference* coeff8_gm_texture_ptr = &MY_AP(coeff8_gm_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
@@ -686,28 +650,25 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
#endif
}
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed");

if(ncoeff>8)
{
if(ncoeff > 8) {
F_FLOAT coeff9[cuda_ntypes2];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j];
}
}

if(use_global_params)
{
cudaMemcpyToSymbol(MY_CONST(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*) );
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice);
_coeff9_gm_tex.normalized = false; // access with normalized texture coordinates
_coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff9_gm_texture_ptr;
cudaGetTextureReference(&coeff9_gm_texture_ptr, MY_CONST(coeff9_gm_tex));
const textureReference* coeff9_gm_texture_ptr = &MY_AP(coeff9_gm_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
@@ -718,6 +679,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
#endif
}
}

CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed");

F_FLOAT special_lj[4];
@@ -727,23 +689,21 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
special_lj[3] = sdata->pair.special_lj[3];

X_FLOAT box_size[3] =
{
X_FLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};

cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(special_lj) , special_lj , sizeof(F_FLOAT)*4);
cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4);
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));

if(need_q)
{
if(need_q) {
F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
F_FLOAT special_coul[4];
special_coul[0] = sdata->pair.special_coul[0];
@@ -751,12 +711,13 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q=f
special_coul[2] = sdata->pair.special_coul[2];
special_coul[3] = sdata->pair.special_coul[3];

cudaMemcpyToSymbol(MY_CONST(special_coul) , special_coul , sizeof(F_FLOAT)*4);
cudaMemcpyToSymbol(MY_CONST(g_ewald) ,&sdata->pair.g_ewald , sizeof(F_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(qqrd2e) ,&qqrd2e_tmp , sizeof(F_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(kappa) ,&sdata->pair.kappa , sizeof(F_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(eng_coul) ,&sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4);
cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*));
}

CUT_CHECK_ERROR("Cuda_Pair: init failed");
}
timespec startpairtime, endpairtime;
@ -764,37 +725,56 @@ timespec startpairtime, endpairtime;
|
||||
void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256)
|
||||
{
|
||||
if(sdata->atom.nlocal == 0) return;
|
||||
|
||||
if(sdata->atom.update_neigh)
|
||||
Cuda_Pair_UpdateNeighbor_AllStyles(sdata, sneighlist);
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_Pair_UpdateNmax_AllStyles(sdata, sneighlist);
|
||||
if(sdata->atom.update_nlocal)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
|
||||
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
|
||||
|
||||
if(sdata->atom.update_nlocal) {
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
|
||||
}
|
||||
|
||||
|
||||
|
||||
BindXTypeTexture(sdata);
|
||||
|
||||
if(need_q) BindQTexture(sdata);
|
||||
|
||||
|
||||
sharedperproc = 0;
|
||||
|
||||
if(sdata->pair.use_block_per_atom) sharedperproc += 3;
|
||||
|
||||
if(eflag) sharedperproc += 1;
|
||||
|
||||
if(need_q && eflag) sharedperproc += 1;
|
||||
|
||||
if(vflag) sharedperproc += 6;
|
||||
|
||||
int threadnum = sneighlist->inum;
|
||||
|
||||
if(sdata->comm.comm_phase == 2)threadnum = sneighlist->inum_border2;
|
||||
if(sdata->pair.use_block_per_atom) {threadnum*=64; maxthreads=64;}
|
||||
|
||||
if(sdata->pair.use_block_per_atom) {
|
||||
threadnum *= 64;
|
||||
maxthreads = 64;
|
||||
}
|
||||
|
||||
int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
|
||||
threads.x = layout.z; threads.y = 1; threads.z = 1;
|
||||
grid.x = layout.x; grid.y = layout.y; grid.z = 1;
|
||||
threads.x = layout.z;
|
||||
threads.y = 1;
|
||||
threads.z = 1;
|
||||
grid.x = layout.x;
|
||||
grid.y = layout.y;
|
||||
grid.z = 1;
|
||||
|
||||
int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT);
|
||||
|
||||
if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT));
|
||||
|
||||
Cuda_UpdateBuffer(sdata, size);
|
||||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
@ -802,6 +782,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis
|
||||
|
||||
sdata->pair.lastgridsize = grid.x * grid.y;
|
||||
sdata->pair.n_energy_virial = sharedperproc;
|
||||
|
||||
if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3;
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &startpairtime);
|
||||
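`sharedperproc` counts how many ENERGY_FLOAT accumulator slots each thread needs in shared memory, and the same count later sizes the global reduction buffer. Worked through for one configuration (a sketch of the arithmetic above, nothing more):

// Sketch: the shared-memory accounting of Cuda_Pair_PreKernel_AllStyles
// for eflag, need_q and vflag all set with block-per-atom enabled.
int slots = 0;
slots += 3;   // use_block_per_atom: per-atom force reduction scratch
slots += 1;   // eflag: vdW energy
slots += 1;   // need_q && eflag: separate Coulomb energy
slots += 6;   // vflag: six virial components
// => 11 slots; with 64 threads per block and a 4-byte ENERGY_FLOAT this is
// 11 * 4 * 64 = 2816 bytes of shared memory per block.
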
@ -812,21 +793,21 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis

//Function which is called after the kernel invocation, collects energy and virial
void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sharedperproc, int eflag, int vflag)
{
if((not sdata->pair.collect_forces_later) && (eflag||vflag))//not sdata->comm.comm_phase==2))
{
if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2))
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME, &endpairtime);
sdata->cuda_timings.pair_kernel +=
endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");

if(eflag||vflag)
{
if(eflag || vflag) {
int n = grid.x * grid.y;

if(sdata->pair.use_block_per_atom)
grid.x = sharedperproc - 3;
else
grid.x = sharedperproc;

grid.y = 1;
dim3 threads(128, 1, 1);
MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)

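The kernel-time bookkeeping above folds a `timespec` difference into seconds: whole seconds plus the nanosecond remainder over 1e9. The same expression as a standalone helper (a sketch, not part of the commit):

#include <time.h>

/* Sketch: elapsed wall-clock seconds between two clock_gettime() samples. */
static double elapsed_seconds(const struct timespec* start, const struct timespec* end)
{
  return (end->tv_sec - start->tv_sec)
         + 1.0 * (end->tv_nsec - start->tv_nsec) / 1000000000;
}
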
@ -878,20 +859,20 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3& grid, int& sh
void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
{
CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) );
cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*) );
cudaMemcpyToSymbol(MY_CONST(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(omega_rmass),& sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*) );
cudaMemcpyToSymbol(MY_CONST(map_array), & sdata->atom.map_array .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
}

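Each call in Cuda_Pair_UpdateNmax copies a scalar or a device pointer value into a `__constant__` variable, so kernels reach the per-atom arrays without passing them as launch arguments; `MY_AP`, like the `MY_CONST` it replaces, expands to the prefixed per-style name of that constant. The pattern in isolation, with hypothetical names:

// Sketch: publishing a device array through a __constant__ pointer.
__device__ __constant__ int* _type;        // hypothetical constant, visible to kernels

void publish_type_array(int* d_type)       // d_type: an existing device allocation
{
  // Copies the pointer value itself, not the array it points to.
  cudaMemcpyToSymbol(_type, &d_type, sizeof(int*));
}

__global__ void read_types(int* out) { out[threadIdx.x] = _type[threadIdx.x]; }
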
@ -899,13 +880,15 @@ void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
void Cuda_Pair_GenerateXType(cuda_shared_data* sdata)
{
MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)

if(sdata->atom.update_nmax)
Cuda_Pair_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
{
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );

if(sdata->atom.update_nlocal) {
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
}

MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);)

int3 layout = getgrid(sdata->atom.nall);
@ -922,10 +905,12 @@ void Cuda_Pair_GenerateXType(cuda_shared_data* sdata)
void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
{
MYDBG(printf(" # CUDA: RevertXType ... start\n");)

if(sdata->atom.update_nmax)
Cuda_Pair_UpdateNmax(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );

cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));

int3 layout = getgrid(sdata->atom.nall);
dim3 threads(layout.z, 1, 1);
@ -940,10 +925,12 @@ void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
{
MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)

if(sdata->atom.update_nmax)
Cuda_Pair_UpdateNmax(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );

cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);)

int3 layout = getgrid(sdata->atom.nall);
@ -960,10 +947,12 @@ void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata)
{
MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)

if(sdata->atom.update_nmax)
Cuda_Pair_UpdateNmax(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );

cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);)

int3 layout = getgrid(sdata->atom.nall);
@ -981,8 +970,9 @@ void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
{
if(sdata->atom.update_nmax)
Cuda_Pair_UpdateNmax(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );

cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));

int3 layout = getgrid(sdata->atom.nall);
dim3 threads(layout.z, 1, 1);
@ -1003,8 +993,7 @@ void Cuda_Pair_CollectForces(cuda_shared_data* sdata,int eflag, int vflag)
dim3 threads;
dim3 grid;

if(eflag||vflag)
{
if(eflag || vflag) {
int n = sdata->pair.lastgridsize;
grid.x = sdata->pair.n_energy_virial;
grid.y = 1;
@ -1014,6 +1003,7 @@ void Cuda_Pair_CollectForces(cuda_shared_data* sdata,int eflag, int vflag)
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
}

int3 layout = getgrid(sdata->atom.nlocal);
threads.x = layout.z;
grid.x = layout.x;

@ -86,8 +86,7 @@ inline void BindEAMTextures(cuda_shared_data* sdata)
_rhor_spline_tex.filterMode = cudaFilterModePoint;     // Point mode, so no interpolation
_rhor_spline_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates

const textureReference* rhor_spline_texture_ptr;
cudaGetTextureReference(&rhor_spline_texture_ptr, MY_CONST(rhor_spline_tex));
const textureReference* rhor_spline_texture_ptr = &MY_AP(rhor_spline_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<float4>();
@ -101,8 +100,7 @@ inline void BindEAMTextures(cuda_shared_data* sdata)
_z2r_spline_tex.filterMode = cudaFilterModePoint;      // Point mode, so no interpolation
_z2r_spline_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates

const textureReference* z2r_spline_texture_ptr;
cudaGetTextureReference(&z2r_spline_texture_ptr, MY_CONST(z2r_spline_tex));
const textureReference* z2r_spline_texture_ptr = &MY_AP(z2r_spline_tex);

#if F_PRECISION == 1
cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<float4>();
@ -121,42 +119,45 @@ void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlis
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
if(sdata->buffersize<size)
{

if(sdata->buffersize < size) {
MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)

if(sdata->buffer != NULL) cudaFree(sdata->buffer);

cudaMalloc((void**)&sdata->buffer, size);
sdata->buffersize = size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );

cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
}

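The buffer logic above is a grow-only reallocation: keep the allocation while it is big enough, otherwise free and re-malloc at the new size and bump `buffer_new` so dependent symbols get refreshed. Condensed into a hypothetical helper:

// Sketch of the grow-only device buffer pattern (names illustrative).
void* grow_device_buffer(void* buf, int& capacity, int needed)
{
  if(capacity >= needed) return buf;   // current allocation still fits
  if(buf) cudaFree(buf);               // otherwise replace it wholesale
  cudaMalloc(&buf, needed);
  capacity = needed;
  return buf;
}
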
void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
}

void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
}

@ -168,53 +169,59 @@ int* type2frho,int** type2z2r,int** type2rhor)
// !! LAMMPS indexes atom types starting with 1 !!

unsigned cuda_ntypes = sdata->atom.ntypes + 1;

if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
"or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);

unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;

X_FLOAT cutsq_global;
cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
cudaMemcpyToSymbol(MY_CONST(cutsq_global) ,&cutsq_global , sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));


F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];

for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];

cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT));

for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];

cudaMemcpyToSymbol(MY_AP(coeff2) , coeff_buf , nI);

for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2z2r[0][0])[i];

cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI);

delete [] coeff_buf;
X_FLOAT box_size[3] =
{
X_FLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};
F_FLOAT rdr_F = rdr;
F_FLOAT rdrho_F = rdrho;
cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), & cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity, sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rdr), &rdr_F, sizeof(F_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(rdrho), &rdrho_F, sizeof(F_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(nr), &nr, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nrho), &nrho, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nfrho), &nfrho, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rho), &rho, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(fp), &fp, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(frho_spline), &frho_spline, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rhor_spline), &rhor_spline, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(z2r_spline), &z2r_spline, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(nrhor), &nrhor, sizeof(int) );
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));

rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);

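The EAM init flattens the per-type tables into row-major 1D arrays before copying them to constant memory, using the same `i * cuda_ntypes + j` linearization the kernels use to read them back (`cuda_ntypes = ntypes + 1` because LAMMPS types start at 1). The round trip as a sketch, with hypothetical names:

// Sketch: flatten a 2D per-type-pair table for a constant-memory copy.
void flatten_table(F_FLOAT** host2d, F_FLOAT* flat, unsigned cuda_ntypes)
{
  for(unsigned i = 1; i < cuda_ntypes; i++)
    for(unsigned j = 1; j < cuda_ntypes; j++)
      flat[i * cuda_ntypes + j] = host2d[i][j];
}
// Device side (hypothetical names) reads with the same indexing:
//   F_FLOAT c = _coeff2[itype * _cuda_ntypes + jtype];
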
@ -232,17 +239,23 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlis

if(sdata->atom.update_nmax)
Cuda_PairEAMCuda_UpdateNmax(sdata, sneighlist);

if(sdata->atom.update_neigh)
Cuda_PairEAMCuda_UpdateNeighbor(sdata, sneighlist);

if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

if(sdata->buffer_new)
Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
cudaMemcpyToSymbol(MY_CONST(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*) );

cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));

int sharedperproc = 0;

if(eflag || eflag_atom) sharedperproc = 1;

if(vflag || vflag_atom) sharedperproc = 7;

int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
@ -270,8 +283,11 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlis
void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
int sharedperproc = 0;

if(eflag || eflag_atom) sharedperproc = 1;

if(vflag || vflag_atom) sharedperproc = 7;

int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
@ -289,8 +305,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlis
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");

if(eflag||vflag)
{
if(eflag || vflag) {
int n = grid.x * grid.y;
grid.x = sharedperproc;
grid.y = 1;

@ -34,23 +34,22 @@ __device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR*MANYBODY_NPAIR*MA
void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h)
{
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
X_FLOAT box_size[3] =
{
X_FLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};

cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
cudaMemcpyToSymbol("params_sw", params_host , sizeof(ParamSW_Float)*nelements_h*nelements_h*nelements_h );
cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h );
cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes );
cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int));
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
cudaMemcpyToSymbol(params_sw, params_host , sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h);
cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h);
cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes);
cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
}

void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
@ -61,8 +60,7 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist,
static int* glob_neighbors_red = NULL;
static int* glob_neightype_red = NULL;

if(glob_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT))
{
if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
cudaFree(glob_r_ij);
cudaFree(glob_numneigh_red);
@ -72,11 +70,12 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist,
cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) );
cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*));
}

dim3 grid, threads;
int sharedperproc;

@ -86,6 +85,7 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist,


dim3 grid2;

if(sdata->atom.nall <= 256 * 64000) {
grid2.x = (sdata->atom.nall + 255) / 256;
grid2.y = 1;
@ -93,6 +93,7 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist,
grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
grid2.y = 128;
}

grid2.z = 1;
dim3 threads2;
threads2.x = 256;
@ -112,17 +113,15 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist,

//actual force calculation
unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
if(eflag)
{

if(eflag) {
if(vflag)
Pair_SW_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
else
Pair_SW_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
}
else
{
} else {
if(vflag)
Pair_SW_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);

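`eflag`/`vflag` enter `Pair_SW_Kernel_TpA` as template parameters, so each flag combination compiles to its own kernel and the energy/virial tallies are dead-code-eliminated when disabled; the host-side ladder only selects the instantiation. The dispatch shape, condensed into a sketch with illustrative names:

// Sketch: compile-time flag dispatch for a CUDA kernel.
template<int EFLAG, int VFLAG>
__global__ void pair_kernel(int eflag_atom, int vflag_atom)
{
  if(EFLAG) { /* energy tally: removed entirely when EFLAG == 0 */ }
  if(VFLAG) { /* virial tally: likewise compiled out */ }
}

void launch_pair(bool eflag, bool vflag, dim3 grid, dim3 threads, size_t shmem)
{
  if(eflag) {
    if(vflag) pair_kernel<1, 1><<<grid, threads, shmem>>>(0, 0);
    else      pair_kernel<1, 0><<<grid, threads, shmem>>>(0, 0);
  } else {
    if(vflag) pair_kernel<0, 1><<<grid, threads, shmem>>>(0, 0);
    else      pair_kernel<0, 0><<<grid, threads, shmem>>>(0, 0);
  }
}
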
@ -39,24 +39,23 @@ __device__ __constant__ bool _zbl; //is tersoff zbl?
void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl)
{
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
X_FLOAT box_size[3] =
{
X_FLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};

cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) );
cudaMemcpyToSymbol("params", params_host , sizeof(Param_Float)*nelements_h*nelements_h*nelements_h );
cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h );
cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes );
cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int));
cudaMemcpyToSymbol("_zbl",&zbl,sizeof(bool));
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
cudaMemcpyToSymbol(params, params_host , sizeof(Param_Float)*nelements_h * nelements_h * nelements_h);
cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h);
cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes);
cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
cudaMemcpyToSymbol(_zbl, &zbl, sizeof(bool));

}

@ -69,8 +68,7 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neigh
static int* glob_neighbors_red = NULL;
static int* glob_neightype_red = NULL;

if(glob_zeta_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT))
{
if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
cudaFree(glob_zeta_ij);
cudaFree(glob_r_ij);
@ -82,12 +80,13 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neigh
cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) );
cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) );
cudaMemcpyToSymbol("_glob_zeta_ij", &glob_zeta_ij , sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*));
cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*));
cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij , sizeof(F_FLOAT*));
}

dim3 grid, threads;
int sharedperproc;

@ -97,6 +96,7 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neigh


dim3 grid2;

if(sdata->atom.nall <= 256 * 64000) {
grid2.x = (sdata->atom.nall + 255) / 256;
grid2.y = 1;
@ -104,6 +104,7 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neigh
grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
grid2.y = 128;
}

grid2.z = 1;
dim3 threads2;
threads2.x = 256;
@ -127,17 +128,15 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neigh

//actual force calculation
unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
if(eflag)
{

if(eflag) {
if(vflag)
Pair_Tersoff_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
else
Pair_Tersoff_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);
}
else
{
} else {
if(vflag)
Pair_Tersoff_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>>
(eflag_atom, vflag_atom);

@ -100,51 +100,51 @@ void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_b
)
{
CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start");
cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int));
cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int));
cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int));
cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, sizeof(int));
cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int));
cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int));
cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int));
cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int));
cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int));
cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int));
cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int));
cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int));
cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int));
cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int));
cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int));
cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int));
cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int));
cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int));
cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int));
cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int));
cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int));
cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int));
cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int));
cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int));
cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int));
cudaMemcpyToSymbol(nxhi_out, &cu_nxhi_out, sizeof(int));
cudaMemcpyToSymbol(nylo_in, &cu_nylo_in, sizeof(int));
cudaMemcpyToSymbol(nyhi_in, &cu_nyhi_in, sizeof(int));
cudaMemcpyToSymbol(nylo_out, &cu_nylo_out, sizeof(int));
cudaMemcpyToSymbol(nyhi_out, &cu_nyhi_out, sizeof(int));
cudaMemcpyToSymbol(nzlo_in, &cu_nzlo_in, sizeof(int));
cudaMemcpyToSymbol(nzhi_in, &cu_nzhi_in, sizeof(int));
cudaMemcpyToSymbol(nzlo_out, &cu_nzlo_out, sizeof(int));
cudaMemcpyToSymbol(nzhi_out, &cu_nzhi_out, sizeof(int));
cudaMemcpyToSymbol(nxlo_fft, &cu_nxlo_fft, sizeof(int));
cudaMemcpyToSymbol(nxhi_fft, &cu_nxhi_fft, sizeof(int));
cudaMemcpyToSymbol(nylo_fft, &cu_nylo_fft, sizeof(int));
cudaMemcpyToSymbol(nyhi_fft, &cu_nyhi_fft, sizeof(int));
cudaMemcpyToSymbol(nzlo_fft, &cu_nzlo_fft, sizeof(int));
cudaMemcpyToSymbol(nzhi_fft, &cu_nzhi_fft, sizeof(int));
cudaMemcpyToSymbol(slabflag, &cu_slabflag, sizeof(int));
cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int));
cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int));
cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int));
cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*));
cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(gf_b, &cu_gf_b, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*));

PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e;
cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("order",&cu_order, sizeof(int));
cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(order, &cu_order, sizeof(int));
cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*));
cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*));

CUT_CHECK_ERROR("ERROR-CUDA poisson_init");

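This hunk is the same API migration in bulk: `cudaMemcpyToSymbol("name", ...)` with a character string was deprecated in CUDA 4.1 and removed in CUDA 5.0, so every symbol is now passed by identifier. Both forms side by side, on a hypothetical constant:

// Sketch: string-based vs. identifier-based symbol copy (hypothetical names).
__device__ __constant__ int d_nlocal;

void set_nlocal(int n)
{
  // Pre-CUDA-5 form, no longer compiles there:
  //   cudaMemcpyToSymbol("d_nlocal", &n, sizeof(int));
  cudaMemcpyToSymbol(d_nlocal, &n, sizeof(int));   // symbol passed directly
}
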
@ -174,35 +174,35 @@ if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precis

void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper)
{
cudaMemcpyToSymbol("delxinv",&cu_delxinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("delyinv",&cu_delyinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("delzinv",&cu_delzinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("shiftone",&cu_shiftone, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol("nlower",&cu_nlower, sizeof(int));
cudaMemcpyToSymbol("nupper",&cu_nupper, sizeof(int));
cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT));
cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int));
cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int));
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo, 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi, 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo, 3 * sizeof(X_FLOAT));
CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup");
}

void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa)
{
cudaMemcpyToSymbol("part2grid", &cu_part2grid, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
//cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int));
cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int));
cudaMemcpyToSymbol("nmax" , &nmaxa, sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
//cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int));
cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int));
cudaMemcpyToSymbol(nmax , &nmaxa, sizeof(int));
CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update");

}

void pppm_update_nlocal(int nlocala)
{
cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int));
cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int));
CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b");
}

@ -389,6 +389,7 @@ ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppm
CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 ");

cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);

if(vflag)
cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy");
@ -408,8 +409,8 @@ void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_int
threads.y = 1;
threads.z = 1;
int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
do
{

do {
cpu_flag[0] = 0;
cpu_flag[1] = 0;
cpu_flag[2] = 0;
@ -425,8 +426,15 @@ void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_int
cudaThreadSynchronize();
CUT_CHECK_ERROR("ERROR-CUDA make_rho A");
cudaMemcpy((void*) &cpu_flag, flag, 3 * sizeof(int), cudaMemcpyDeviceToHost);
if(cpu_flag[0]!=0) {(*cu_density_intScale)/=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n",*cu_density_intScale);)}
if((cpu_flag[0]==0)&&(cpu_flag[1]==0)) {(*cu_density_intScale)*=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n",*cu_density_intScale);)}

if(cpu_flag[0] != 0) {
(*cu_density_intScale) /= 2;
MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n", *cu_density_intScale);)
}

if((cpu_flag[0] == 0) && (cpu_flag[1] == 0)) {
(*cu_density_intScale) *= 2;
MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n", *cu_density_intScale);)
}
/* if((*cu_density_intScale)>0xe0000000)
{
printf("Error Scaling\n");

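The do-loop above auto-tunes the fixed-point scale used when charge density is accumulated in integers: halve the scale when the kernel reports an overflow, double it while both flags show ample headroom, and run the deposit again until the scale settles. Schematically (flag meanings inferred from the surrounding code, loop condition assumed):

// Sketch: adaptive fixed-point scale for integer density accumulation.
// cpu_flag[0] != 0  => an accumulator overflowed at the current scale
// cpu_flag[1] != 0  => values already near the limit (scale is "just right")
double intScale = 1e10;                    // hypothetical starting value
for(;;) {
  int cpu_flag[2] = {0, 0};
  // ... launch the deposit kernel at intScale, copy cpu_flag back ...
  if(cpu_flag[0] != 0)      intScale /= 2;   // back off after overflow
  else if(cpu_flag[1] == 0) intScale *= 2;   // plenty of headroom: grow
  else break;                                // converged
}
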
@ -496,6 +504,7 @@ double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_F
cudaMemcpy((void*) buf, dev_buf, grid.x* sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);

double dipole_all = 0.0;

for(int i = 0; i < grid.x; i++)
dipole_all += buf[i];

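This is the host tail of a two-stage reduction: each block has already written one partial sum of its atoms' contributions into `dev_buf`, and the host adds up the `grid.x` partials. As a self-contained sketch (hypothetical helper):

// Sketch: finish a per-block GPU reduction on the host.
double finish_reduction(const ENERGY_FLOAT* dev_buf, ENERGY_FLOAT* host_buf, int nblocks)
{
  cudaMemcpy(host_buf, dev_buf, nblocks * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
  double total = 0.0;
  for(int i = 0; i < nblocks; i++) total += host_buf[i];  // cheap: O(grid.x) items
  return total;
}
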